Diffstat (limited to 'fs')
311 files changed, 16427 insertions, 6960 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 14d94420457..08b2eb15704 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -151,7 +151,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) if (access == V9FS_ACCESS_SINGLE) return ERR_PTR(-EPERM); - if (v9fs_extended(v9ses)) + if (v9fs_proto_dotu(v9ses)) uname = NULL; else uname = v9ses->uname; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 7d6c2139891..6c7f6a25111 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -241,7 +241,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); - v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER; + v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER; strcpy(v9ses->uname, V9FS_DEFUSER); strcpy(v9ses->aname, V9FS_DEFANAME); v9ses->uid = ~0; @@ -262,13 +262,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto error; } - if (!v9ses->clnt->dotu) - v9ses->flags &= ~V9FS_EXTENDED; + if (!p9_is_proto_dotu(v9ses->clnt)) + v9ses->flags &= ~V9FS_PROTO_2000U; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; /* for legacy mode, fall back to V9FS_ACCESS_ANY */ - if (!v9fs_extended(v9ses) && + if (!v9fs_proto_dotu(v9ses) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { v9ses->flags &= ~V9FS_ACCESS_MASK; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 019f4ccb70c..6b801d1ddf4 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -23,7 +23,8 @@ /** * enum p9_session_flags - option flags for each 9P session - * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions + * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) * @V9FS_ACCESS_ANY: use a single attach for all users @@ -32,11 +33,12 @@ * Session flags reflect options selected by users at mount time */ enum p9_session_flags { - V9FS_EXTENDED = 0x01, - V9FS_ACCESS_SINGLE = 0x02, - V9FS_ACCESS_USER = 0x04, - V9FS_ACCESS_ANY = 0x06, - V9FS_ACCESS_MASK = 0x06, + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2000L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_ANY = 0x0C, + V9FS_ACCESS_MASK = 0x0C, }; /* possible values of ->cache */ @@ -121,7 +123,12 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) return (inode->i_sb->s_fs_info); } -static inline int v9fs_extended(struct v9fs_session_info *v9ses) +static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) { - return v9ses->flags & V9FS_EXTENDED; + return v9ses->flags & V9FS_PROTO_2000U; +} + +static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) +{ + return v9ses->flags & V9FS_PROTO_2000L; } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 15cce53bf61..d8a3afe4ff7 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -76,6 +76,15 @@ static inline int dt_type(struct p9_wstat *mistat) return rettype; } +static void p9stat_init(struct p9_wstat *stbuf) +{ + stbuf->name = NULL; + stbuf->uid = NULL; + stbuf->gid = NULL; + stbuf->muid = NULL; + stbuf->extension = NULL; +} + /** * v9fs_dir_readdir - read a directory * @filp: opened file structure @@ -131,11 +140,11 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) rdir->head = 0; rdir->tail = err; } - while (rdir->head < rdir->tail) { + p9stat_init(&st); err = p9stat_read(rdir->buf + rdir->head, buflen - rdir->head, &st, - 
fid->clnt->dotu); + fid->clnt->proto_version); if (err) { P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); err = -EIO; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 74a0461a9ac..df52d488d2a 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file); v9ses = v9fs_inode2v9ses(inode); - omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses)); + omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file->f_path.dentry); @@ -77,7 +77,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) i_size_write(inode, 0); inode->i_blocks = 0; } - if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses))) + if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses))) generic_file_llseek(file, 0, SEEK_END); } @@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); /* No mandatory locks */ - if (__mandatory_lock(inode)) + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) return -ENOLCK; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { @@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data, struct p9_fid *fid; struct p9_client *clnt; struct inode *inode = filp->f_path.dentry->d_inode; - int origin = *offset; + loff_t origin = *offset; unsigned long pg_start, pg_end; P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index a407fa3388c..5fe45d692c9 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -60,7 +60,7 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) res = mode & 0777; if (S_ISDIR(mode)) res |= P9_DMDIR; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if (S_ISLNK(mode)) res |= P9_DMSYMLINK; if (v9ses->nodev == 0) { @@ -102,21 +102,21 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; - else if ((mode & P9_DMSYMLINK) && (v9fs_extended(v9ses))) + else if ((mode & P9_DMSYMLINK) && (v9fs_proto_dotu(v9ses))) res |= S_IFLNK; - else if ((mode & P9_DMSOCKET) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMSOCKET) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFSOCK; - else if ((mode & P9_DMNAMEDPIPE) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMNAMEDPIPE) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFIFO; - else if ((mode & P9_DMDEVICE) && (v9fs_extended(v9ses)) + else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) res |= S_IFBLK; else res |= S_IFREG; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if ((mode & P9_DMSETUID) == P9_DMSETUID) res |= S_ISUID; @@ -265,7 +265,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) case S_IFBLK: case S_IFCHR: case S_IFSOCK: - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "special files without extended mode\n"); err = -EINVAL; @@ -278,7 +278,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) inode->i_fop = &v9fs_file_operations; break; case S_IFLNK: - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used w/o 9P2000.u\n"); err = -EINVAL; @@ -288,7 +288,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int 
mode) break; case S_IFDIR: inc_nlink(inode); - if (v9fs_extended(v9ses)) + if (v9fs_proto_dotu(v9ses)) inode->i_op = &v9fs_dir_inode_operations_ext; else inode->i_op = &v9fs_dir_inode_operations; @@ -575,7 +575,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, flags = O_RDWR; fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags, v9fs_extended(v9ses))); + v9fs_uflags2omode(flags, + v9fs_proto_dotu(v9ses))); if (IS_ERR(fid)) { err = PTR_ERR(fid); fid = NULL; @@ -858,7 +859,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) if (iattr->ia_valid & ATTR_SIZE) wstat.length = iattr->ia_size; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { if (iattr->ia_valid & ATTR_UID) wstat.n_uid = iattr->ia_uid; @@ -886,6 +887,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { char ext[32]; + char tag_name[14]; + unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; inode->i_nlink = 1; @@ -897,11 +900,26 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_uid = v9ses->dfltuid; inode->i_gid = v9ses->dfltgid; - if (v9fs_extended(v9ses)) { + if (v9fs_proto_dotu(v9ses)) { inode->i_uid = stat->n_uid; inode->i_gid = stat->n_gid; } - + if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { + if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + /* + * Hardlink support got added later + * to the .u extension. So there can be + * servers out there that don't support + * this even with the .u extension. So check + * for a non-NULL stat->extension + */ + strncpy(ext, stat->extension, sizeof(ext)); + /* HARDLINKCOUNT %u */ + sscanf(ext, "%13s %u", tag_name, &i_nlink); + if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + inode->i_nlink = i_nlink; + } + } inode->i_mode = p9mode2unixmode(v9ses, stat->mode); if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { char type = 0; @@ -976,7 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (IS_ERR(fid)) return PTR_ERR(fid); - if (!v9fs_extended(v9ses)) + if (!v9fs_proto_dotu(v9ses)) return -EBADF; st = p9_client_stat(fid); @@ -1066,7 +1084,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, struct p9_fid *fid; v9ses = v9fs_inode2v9ses(dir); - if (!v9fs_extended(v9ses)) { + if (!v9fs_proto_dotu(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n"); return -EPERM; } diff --git a/fs/Kconfig b/fs/Kconfig index 64d44efad7a..7405f071be6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -177,6 +177,7 @@ source "fs/efs/Kconfig" source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" +source "fs/logfs/Kconfig" source "fs/cramfs/Kconfig" source "fs/squashfs/Kconfig" source "fs/freevxfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index af6d04700d9..c3633aa4691 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_LOGFS) += logfs/ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 9cc18775b83..2ff622f6f54 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -121,7 +121,7 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); -int adfs_write_inode(struct inode *inode,int unused); +int adfs_write_inode(struct inode *inode, struct
writeback_control *wbc); int adfs_notify_change(struct dentry *dentry, struct iattr *attr); /* map.c */ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 3f57ce4bee5..0f5e3097813 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -9,6 +9,7 @@ */ #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> #include "adfs.h" /* @@ -360,7 +361,7 @@ out: * The adfs-specific inode data has already been updated by * adfs_notify_change() */ -int adfs_write_inode(struct inode *inode, int wait) +int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; @@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wait); + ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 0e40caaba45..861dae68ac1 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -175,7 +175,8 @@ extern void affs_delete_inode(struct inode *inode); extern void affs_clear_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); -extern int affs_write_inode(struct inode *inode, int); +extern int affs_write_inode(struct inode *inode, + struct writeback_control *wbc); extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); /* file.c */ diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index dc5ef14bdc1..8306d53307e 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -128,7 +128,7 @@ err_range: /* * Allocate a block in the given allocation zone. * Since we have to byte-swap the bitmap on little-endian - * machines, this is rather expensive. Therefor we will + * machines, this is rather expensive. Therefore we will * preallocate up to 16 blocks from the same word, if * possible. We are not doing preallocations in the * header zone, though. 
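The adfs hunks above, and the affs, bfs and btrfs hunks that follow, all make the same ->write_inode conversion: the old int wait argument becomes a struct writeback_control pointer. As a minimal sketch of the pattern (not part of the patch; "foofs" and foofs_sync_metadata() are hypothetical stand-ins), the old wait semantics are recovered from wbc->sync_mode:

#include <linux/fs.h>
#include <linux/writeback.h>

/* hypothetical stand-in for the fs-specific update, e.g. adfs_dir_update() */
static int foofs_sync_metadata(struct inode *inode, int wait)
{
        return 0;
}

/* old signature: int foofs_write_inode(struct inode *inode, int wait) */
static int foofs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        /* WB_SYNC_ALL corresponds to the old wait != 0 case */
        return foofs_sync_metadata(inode, wbc->sync_mode == WB_SYNC_ALL);
}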
diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 3c4ec7d864c..c9744d771d9 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -166,7 +166,7 @@ bad_inode: } int -affs_write_inode(struct inode *inode, int unused) +affs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 6ece2a13bf7..c54dad4e606 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata); extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); -extern int afs_write_inode(struct inode *, int); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); diff --git a/fs/afs/super.c b/fs/afs/super.c index e1ea1c240b6..14f6431598a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = { static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, - .write_inode = afs_write_inode, .destroy_inode = afs_destroy_inode, .clear_inode = afs_clear_inode, .put_super = afs_put_super, diff --git a/fs/afs/write.c b/fs/afs/write.c index 5e15a21dbf9..3bed54a294d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -585,27 +585,6 @@ int afs_writepages(struct address_space *mapping, } /* - * write an inode back - */ -int afs_write_inode(struct inode *inode, int sync) -{ - struct afs_vnode *vnode = AFS_FS_I(inode); - int ret; - - _enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode); - - ret = 0; - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret < 0) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - } - - _leave(" = %d", ret); - return ret; -} - -/* * completion of write to server */ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 9f0bf13291e..2de009565d8 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -209,6 +209,7 @@ static struct inode *anon_inode_mkinode(void) inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); + inode->i_flags |= S_PRIVATE; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; return inode; } diff --git a/fs/attr.c b/fs/attr.c index 96d394bdadd..0815e93bb48 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -12,7 +12,6 @@ #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> -#include <linux/quotaops.h> #include <linux/security.h> /* Taken over from the old code... */ @@ -82,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset) if (inode->i_size < offset) { unsigned long limit; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) @@ -212,14 +211,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr) error = inode->i_op->setattr(dentry, attr); } else { error = inode_change_ok(inode, attr); - if (!error) { - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) - error = vfs_dq_transfer(inode, attr) ? 
- -EDQUOT : 0; - if (!error) - error = inode_setattr(inode, attr); - } + if (!error) + error = inode_setattr(inode, attr); } if (ia_valid & ATTR_SIZE) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8f3d9fd8960..f22a7d3dc36 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/writeback.h> #include <asm/uaccess.h> #include "bfs.h" @@ -98,7 +99,7 @@ error: return ERR_PTR(-EIO); } -static int bfs_write_inode(struct inode *inode, int wait) +static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; @@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait) di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) err = -EIO; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index fdd39709917..15d80bb35d6 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -24,6 +24,7 @@ #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/init.h> +#include <linux/coredump.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -60,26 +61,6 @@ static int set_brk(unsigned long start, unsigned long end) } /* - * These are the only things you should do on a core-file: use only these - * macros to write out all the necessary info. - */ - -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -#define DUMP_WRITE(addr, nr) \ - if (!dump_write(file, (void *)(addr), (nr))) \ - goto end_coredump; - -#define DUMP_SEEK(offset) \ -if (file->f_op->llseek) { \ - if (file->f_op->llseek(file,(offset),0) != (offset)) \ - goto end_coredump; \ -} else file->f_pos = (offset) - -/* * Routine writes a core dump image in the current directory. * Currently only a stub-function. * @@ -130,26 +111,31 @@ static int aout_core_dump(struct coredump_params *cprm) set_fs(KERNEL_DS); /* struct user */ - DUMP_WRITE(&dump,sizeof(dump)); + if (!dump_write(file, &dump, sizeof(dump))) + goto end_coredump; /* Now dump all of the user data. Include malloced stuff as well */ - DUMP_SEEK(PAGE_SIZE); + if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump))) + goto end_coredump; /* now we start writing out the user space info */ set_fs(USER_DS); /* Dump the data area */ if (dump.u_dsize != 0) { dump_start = START_DATA(dump); dump_size = dump.u_dsize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; } /* Now prepare to dump the stack area */ if (dump.u_ssize != 0) { dump_start = START_STACK(dump); dump_size = dump.u_ssize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + if (!dump_write(file, dump_start, dump_size)) + goto end_coredump; } /* Finally dump the task struct. Not be used by gdb, but could be useful */ set_fs(KERNEL_DS); - DUMP_WRITE(current,sizeof(*current)); + if (!dump_write(file, current, sizeof(*current))) + goto end_coredump; end_coredump: set_fs(fs); return has_dumped; @@ -247,7 +233,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. 
*/ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fd5b2ea5d29..535e763ab1a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -31,6 +31,7 @@ #include <linux/random.h> #include <linux/elf.h> #include <linux/utsname.h> +#include <linux/coredump.h> #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -1085,36 +1086,6 @@ out: * Modelled on fs/exec.c:aout_core_dump() * Jeremy Fitzhardinge <jeremy@sw.oz.au> */ -/* - * These are the only things you should do on a core-file: use only these - * functions to write out all the necessary info. - */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static int dump_seek(struct file *file, loff_t off) -{ - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) - return 0; - off -= n; - } - free_page((unsigned long)buf); - } - return 1; -} /* * Decide what to dump of a segment, part, all or none. @@ -1249,11 +1220,6 @@ static int writenote(struct memelfnote *men, struct file *file, } #undef DUMP_WRITE -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > cprm->limit || \ - !dump_write(cprm->file, (addr), (nr))) \ - goto end_coredump; - static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags, u8 osabi) { @@ -1872,6 +1838,34 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, return gate_vma; } +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + +static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma, + unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) + size += vma_dump_size(vma, mm_flags); + return size; +} + /* * Actual dumper * @@ -1888,8 +1882,11 @@ static int elf_core_dump(struct coredump_params *cprm) struct vm_area_struct *vma, *gate_vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff, foffset; - unsigned long mm_flags; struct elf_note_info info; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; /* * We no longer stop all VM operations. @@ -1912,20 +1909,25 @@ static int elf_core_dump(struct coredump_params *cprm) * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ segs = current->mm->map_count; -#ifdef ELF_CORE_EXTRA_PHDRS - segs += ELF_CORE_EXTRA_PHDRS; -#endif + segs += elf_core_extra_phdrs(); gate_vma = get_gate_vma(current); if (gate_vma != NULL) segs++; + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. 
Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ? PN_XNUM : segs; + /* * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(elf, segs + 1, /* including notes section */ - &info, cprm->signr, cprm->regs)) + if (!fill_note_info(elf, e_phnum, &info, cprm->signr, cprm->regs)) goto cleanup; has_dumped = 1; @@ -1934,31 +1936,47 @@ static int elf_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - DUMP_WRITE(elf, sizeof(*elf)); offset += sizeof(*elf); /* Elf header */ - offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ foffset = offset; /* Write notes phdr entry */ { - struct elf_phdr phdr; size_t sz = get_note_info_size(&info); sz += elf_coredump_extra_notes_size(); - fill_elf_note_phdr(&phdr, sz, offset); + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); offset += sz; - DUMP_WRITE(&phdr, sizeof(phdr)); } dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency between the program headers and bodies, otherwise an - * unusable core file can be generated. - */ - mm_flags = current->mm->flags; + offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; /* Write program headers for segments dump */ for (vma = first_vma(current, gate_vma); vma != NULL; @@ -1969,7 +1987,7 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; - phdr.p_filesz = vma_dump_size(vma, mm_flags); + phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); phdr.p_memsz = vma->vm_end - vma->vm_start; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; @@ -1979,12 +1997,14 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; - DUMP_WRITE(&phdr, sizeof(phdr)); + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; } -#ifdef ELF_CORE_WRITE_EXTRA_PHDRS - ELF_CORE_WRITE_EXTRA_PHDRS; -#endif + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; /* write out the notes section */ if (!write_note_info(&info, cprm->file, &foffset)) @@ -2002,7 +2022,7 @@ static int elf_core_dump(struct coredump_params *cprm) unsigned long addr; unsigned long end; - end = vma->vm_start + vma_dump_size(vma, mm_flags); + end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; @@ -2023,15 +2043,24 @@ static int elf_core_dump(struct coredump_params *cprm) } } -#ifdef ELF_CORE_WRITE_EXTRA_DATA - ELF_CORE_WRITE_EXTRA_DATA; -#endif + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } end_coredump: set_fs(fs); cleanup: free_note_info(&info); + kfree(shdr4extnum); + kfree(phdr4note); kfree(elf); out: return has_dumped; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 18d77297ccc..2c32d00a669 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -34,6 +34,7 @@ #include <linux/elf.h> #include <linux/elf-fdpic.h> #include <linux/elfcore.h> +#include <linux/coredump.h> #include <asm/uaccess.h> #include <asm/param.h> @@ -1216,26 +1217,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, #ifdef CONFIG_ELF_CORE /* - * These are the only things you should do on a core-file: use only these - * functions to write out all the necessary info. - */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static int dump_seek(struct file *file, loff_t off) -{ - if (file->f_op->llseek) { - if (file->f_op->llseek(file, off, SEEK_SET) != off) - return 0; - } else { - file->f_pos = off; - } - return 1; -} - -/* * Decide whether a segment is worth dumping; default is yes to be * sure (missing info is worse than too much; etc). * Personally I'd include everything, and use the coredump limit... 
@@ -1313,35 +1294,35 @@ static int notesize(struct memelfnote *en) /* #define DEBUG */ -#define DUMP_WRITE(addr, nr) \ - do { if (!dump_write(file, (addr), (nr))) return 0; } while(0) -#define DUMP_SEEK(off) \ - do { if (!dump_seek(file, (off))) return 0; } while(0) +#define DUMP_WRITE(addr, nr, foffset) \ + do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0) -static int writenote(struct memelfnote *men, struct file *file) +static int alignfile(struct file *file, loff_t *foffset) { - struct elf_note en; + static const char buf[4] = { 0, }; + DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset); + return 1; +} +static int writenote(struct memelfnote *men, struct file *file, + loff_t *foffset) +{ + struct elf_note en; en.n_namesz = strlen(men->name) + 1; en.n_descsz = men->datasz; en.n_type = men->type; - DUMP_WRITE(&en, sizeof(en)); - DUMP_WRITE(men->name, en.n_namesz); - /* XXX - cast from long long to long to avoid need for libgcc.a */ - DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ - DUMP_WRITE(men->data, men->datasz); - DUMP_SEEK(roundup((unsigned long)file->f_pos, 4)); /* XXX */ + DUMP_WRITE(&en, sizeof(en), foffset); + DUMP_WRITE(men->name, en.n_namesz, foffset); + if (!alignfile(file, foffset)) + return 0; + DUMP_WRITE(men->data, men->datasz, foffset); + if (!alignfile(file, foffset)) + return 0; return 1; } #undef DUMP_WRITE -#undef DUMP_SEEK - -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > cprm->limit || \ - !dump_write(cprm->file, (addr), (nr))) \ - goto end_coredump; static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) { @@ -1393,7 +1374,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type /* * fill up all the fields in prstatus from the given task struct, except - * registers which need to be filled up seperately. + * registers which need to be filled up separately. 
*/ static void fill_prstatus(struct elf_prstatus *prstatus, struct task_struct *p, long signr) @@ -1524,6 +1505,22 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) return sz; } +static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, + elf_addr_t e_shoff, int segs) +{ + elf->e_shoff = e_shoff; + elf->e_shentsize = sizeof(*shdr4extnum); + elf->e_shnum = 1; + elf->e_shstrndx = SHN_UNDEF; + + memset(shdr4extnum, 0, sizeof(*shdr4extnum)); + + shdr4extnum->sh_type = SHT_NULL; + shdr4extnum->sh_size = elf->e_shnum; + shdr4extnum->sh_link = elf->e_shstrndx; + shdr4extnum->sh_info = segs; +} + /* * dump the segments for an MMU process */ @@ -1552,7 +1549,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, err = -EIO; kunmap(page); page_cache_release(page); - } else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) + } else if (!dump_seek(file, PAGE_SIZE)) err = -EFBIG; if (err) goto out; @@ -1588,6 +1585,17 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, } #endif +static size_t elf_core_vma_data_size(unsigned long mm_flags) +{ + struct vm_area_struct *vma; + size_t size = 0; + + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if (maydump(vma, mm_flags)) + size += vma->vm_end - vma->vm_start; + return size; +} + /* * Actual dumper * @@ -1605,7 +1613,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) int i; struct vm_area_struct *vma; struct elfhdr *elf = NULL; - loff_t offset = 0, dataoff; + loff_t offset = 0, dataoff, foffset; int numnote; struct memelfnote *notes = NULL; struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ @@ -1618,7 +1626,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) #endif int thread_status_size = 0; elf_addr_t *auxv; - unsigned long mm_flags; + struct elf_phdr *phdr4note = NULL; + struct elf_shdr *shdr4extnum = NULL; + Elf_Half e_phnum; + elf_addr_t e_shoff; /* * We no longer stop all VM operations. @@ -1683,12 +1694,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; -#ifdef ELF_CORE_EXTRA_PHDRS - segs += ELF_CORE_EXTRA_PHDRS; -#endif + segs += elf_core_extra_phdrs(); + + /* for notes section */ + segs++; + + /* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid + * this, kernel supports extended numbering. Have a look at + * include/linux/elf.h for further information. */ + e_phnum = segs > PN_XNUM ?
PN_XNUM : segs; /* Set up header */ - fill_elf_fdpic_header(elf, segs + 1); /* including notes section */ + fill_elf_fdpic_header(elf, e_phnum); has_dumped = 1; current->flags |= PF_DUMPCORE; @@ -1727,13 +1744,12 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - DUMP_WRITE(elf, sizeof(*elf)); offset += sizeof(*elf); /* Elf header */ - offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */ + offset += segs * sizeof(struct elf_phdr); /* Program headers */ + foffset = offset; /* Write notes phdr entry */ { - struct elf_phdr phdr; int sz = 0; for (i = 0; i < numnote; i++) @@ -1741,20 +1757,38 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) sz += thread_status_size; - fill_elf_note_phdr(&phdr, sz, offset); + phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); + if (!phdr4note) + goto end_coredump; + + fill_elf_note_phdr(phdr4note, sz, offset); offset += sz; - DUMP_WRITE(&phdr, sizeof(phdr)); } /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency between the program headers and bodies, otherwise an - * unusable core file can be generated. - */ - mm_flags = current->mm->flags; + offset += elf_core_vma_data_size(cprm->mm_flags); + offset += elf_core_extra_data_size(); + e_shoff = offset; + + if (e_phnum == PN_XNUM) { + shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); + if (!shdr4extnum) + goto end_coredump; + fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + } + + offset = dataoff; + + size += sizeof(*elf); + if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf))) + goto end_coredump; + + size += sizeof(*phdr4note); + if (size > cprm->limit + || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note))) + goto end_coredump; /* write program headers for segments dump */ for (vma = current->mm->mmap; vma; vma = vma->vm_next) { @@ -1767,7 +1801,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; - phdr.p_filesz = maydump(vma, mm_flags) ? sz : 0; + phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; phdr.p_memsz = sz; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; @@ -1777,16 +1811,18 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; - DUMP_WRITE(&phdr, sizeof(phdr)); + size += sizeof(phdr); + if (size > cprm->limit + || !dump_write(cprm->file, &phdr, sizeof(phdr))) + goto end_coredump; } -#ifdef ELF_CORE_WRITE_EXTRA_PHDRS - ELF_CORE_WRITE_EXTRA_PHDRS; -#endif + if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit)) + goto end_coredump; /* write out the notes section */ for (i = 0; i < numnote; i++) - if (!writenote(notes + i, cprm->file)) + if (!writenote(notes + i, cprm->file, &foffset)) goto end_coredump; /* write out the thread status notes section */ @@ -1795,20 +1831,27 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], cprm->file)) + if (!writenote(&tmp->notes[i], cprm->file, &foffset)) goto end_coredump; } - if (!dump_seek(cprm->file, dataoff)) + if (!dump_seek(cprm->file, dataoff - foffset)) goto end_coredump; if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, - mm_flags) < 0) + cprm->mm_flags) < 0) goto end_coredump; -#ifdef ELF_CORE_WRITE_EXTRA_DATA - ELF_CORE_WRITE_EXTRA_DATA; -#endif + if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit)) + goto end_coredump; + + if (e_phnum == PN_XNUM) { + size += sizeof(*shdr4extnum); + if (size > cprm->limit + || !dump_write(cprm->file, shdr4extnum, + sizeof(*shdr4extnum))) + goto end_coredump; + } if (cprm->file->f_pos != offset) { /* Sanity check */ @@ -1826,7 +1869,7 @@ cleanup: list_del(tmp); kfree(list_entry(tmp, struct elf_thread_status, list)); } - + kfree(phdr4note); kfree(elf); kfree(prstatus); kfree(psinfo); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 42c6b4a5444..e0e769bdca5 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm, * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (data_len + bss_len > rlim) { @@ -264,13 +264,12 @@ EXPORT_SYMBOL(bio_init); * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate - * @bs: the bio_set to allocate from. If %NULL, just use kmalloc + * @bs: the bio_set to allocate from. * * Description: - * bio_alloc_bioset will first try its own mempool to satisfy the allocation. + * bio_alloc_bioset will try its own mempool to satisfy the allocation. * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. If a %NULL @bs is passed in, we will - * fall back to just using @kmalloc to allocate the required memory. + * for a &struct bio to become free. 
* * Note that the caller must set ->bi_destructor on successful return * of a bio, to do the appropriate freeing of the bio once the reference diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3f1f50d9d91..7a4dee19983 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -153,6 +153,11 @@ struct btrfs_inode { unsigned ordered_data_close:1; unsigned dummy_inode:1; + /* + * always compress this one file + */ + unsigned force_compress:1; + struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a11a32058b5..28b92a7218a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -478,7 +478,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); + page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2aa8ec6a098..0af2e386857 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -373,11 +373,13 @@ struct btrfs_super_block { * ones specified below then we will fail to mount */ #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ - BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL) /* * A leaf is full of items. offset and size tell us where to find @@ -1182,7 +1184,6 @@ struct btrfs_root { #define BTRFS_INODE_NOATIME (1 << 9) #define BTRFS_INODE_DIRSYNC (1 << 10) - /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -1842,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, compat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, - compat_flags, 64); + compat_ro_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, @@ -2310,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, @@ -2326,7 +2328,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, int wait); +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -2335,7 +2337,7 @@ int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root); + 
struct btrfs_root *root, int *was_new); int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, @@ -2386,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root); ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ -u64 btrfs_parse_size(char *str); int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b59201b955..11d0ad30e20 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -263,13 +263,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, static int verify_parent_transid(struct extent_io_tree *io_tree, struct extent_buffer *eb, u64 parent_transid) { + struct extent_state *cached_state = NULL; int ret; if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; - lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); - if (extent_buffer_uptodate(io_tree, eb) && + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, + 0, &cached_state, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb, cached_state) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; @@ -282,10 +284,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, (unsigned long long)btrfs_header_generation(eb)); } ret = 1; - clear_extent_buffer_uptodate(io_tree, eb); + clear_extent_buffer_uptodate(io_tree, eb, &cached_state); out: - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); + unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state, GFP_NOFS); return ret; } @@ -901,7 +903,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->highest_objectid = 0; root->name = NULL; root->in_sysfs = 0; - root->inode_tree.rb_node = NULL; + root->inode_tree = RB_ROOT; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); @@ -1673,7 +1675,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, insert_inode_hash(fs_info->btree_inode); spin_lock_init(&fs_info->block_group_cache_lock); - fs_info->block_group_cache_tree.rb_node = NULL; + fs_info->block_group_cache_tree = RB_ROOT; extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2497,7 +2499,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int ret; struct inode *btree_inode = buf->first_page->mapping->host; - ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, + NULL); if (!ret) return ret; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ba5c3fd5ab8..951ef09b82f 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - inode = btrfs_iget(sb, &key, root); + inode = btrfs_iget(sb, &key, root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail; @@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); + dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); if (!IS_ERR(dentry)) dentry->d_op = &btrfs_dentry_operations; return dentry; 
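The disk-io.c hunk above and the extent-tree.c, extent_io.c, file.c and inode.c hunks that follow all thread a cached extent_state through the lock and unlock calls. A minimal sketch of the pattern (illustrative only; example_locked_range() is a hypothetical caller living in btrfs-internal context, using the lock_extent_bits()/unlock_extent_cached() pair this series introduces):

/* btrfs-internal context: assumes "ctree.h" and "extent_io.h" */
static void example_locked_range(struct inode *inode, u64 start, u64 end)
{
        struct extent_state *cached_state = NULL;

        /* lock [start, end] and remember the covering extent_state */
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end,
                         0, &cached_state, GFP_NOFS);

        /* ... operate on the locked range ... */

        /* handing the cached state back avoids a second rbtree search */
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end,
                             &cached_state, GFP_NOFS);
}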
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 559f72489b3..1727b26fb19 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6561,6 +6561,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root, struct btrfs_key key; struct inode *inode = NULL; struct btrfs_file_extent_item *fi; + struct extent_state *cached_state = NULL; u64 num_bytes; u64 skip_objectid = 0; u32 nritems; @@ -6589,12 +6590,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root, } num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - lock_extent(&BTRFS_I(inode)->io_tree, key.offset, - key.offset + num_bytes - 1, GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, 0, &cached_state, + GFP_NOFS); btrfs_drop_extent_cache(inode, key.offset, key.offset + num_bytes - 1, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, - key.offset + num_bytes - 1, GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, &cached_state, + GFP_NOFS); cond_resched(); } iput(inode); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b177ed31961..c99121ac5d6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -104,8 +104,8 @@ void extent_io_exit(void) void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping, gfp_t mask) { - tree->state.rb_node = NULL; - tree->buffer.rb_node = NULL; + tree->state = RB_ROOT; + tree->buffer = RB_ROOT; tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); @@ -513,7 +513,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_end; int err; int set = 0; + int clear = 0; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + clear = 1; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -524,14 +527,20 @@ again: spin_lock(&tree->lock); if (cached_state) { cached = *cached_state; - *cached_state = NULL; - cached_state = NULL; + + if (clear) { + *cached_state = NULL; + cached_state = NULL; + } + if (cached && cached->tree && cached->start == start) { - atomic_dec(&cached->refs); + if (clear) + atomic_dec(&cached->refs); state = cached; goto hit_next; } - free_extent_state(cached); + if (clear) + free_extent_state(cached); } /* * this search will find the extents that end after @@ -946,11 +955,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) + struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, - 0, NULL, NULL, mask); + 0, NULL, cached_state, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, @@ -984,10 +993,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end, struct extent_state **cached_state, + gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - NULL, mask); + cached_state, mask); } int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) @@ -1171,7 +1181,8 @@ out: * 1 is returned if we find something, 0 if nothing was in the tree */ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes) + u64 *start, u64 *end, u64 max_bytes, + struct extent_state **cached_state) { struct 
rb_node *node; struct extent_state *state; @@ -1203,8 +1214,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *end = state->end; goto out; } - if (!found) + if (!found) { *start = state->start; + *cached_state = state; + atomic_inc(&state->refs); + } found++; *end = state->end; cur_start = state->end + 1; @@ -1336,10 +1350,11 @@ again: delalloc_start = *start; delalloc_end = 0; found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, - max_bytes); + max_bytes, &cached_state); if (!found || delalloc_end <= *start) { *start = delalloc_start; *end = delalloc_end; + free_extent_state(cached_state); return found; } @@ -1722,7 +1737,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) } if (!uptodate) { - clear_extent_uptodate(tree, start, end, GFP_NOFS); + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } @@ -1750,7 +1765,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err) static void end_bio_extent_readpage(struct bio *bio, int err) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; struct extent_io_tree *tree; u64 start; u64 end; @@ -1773,7 +1789,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) else whole_page = 0; - if (--bvec >= bio->bi_io_vec) + if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { @@ -1818,7 +1834,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } check_page_locked(tree, page); } - } while (bvec >= bio->bi_io_vec); + } while (bvec <= bvec_end); bio_put(bio); } @@ -2704,6 +2720,7 @@ int extent_readpages(struct extent_io_tree *tree, int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset) { + struct extent_state *cached_state = NULL; u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; @@ -2712,12 +2729,12 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent(tree, start, end, GFP_NOFS); + lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - 1, 1, NULL, GFP_NOFS); + 1, 1, &cached_state, GFP_NOFS); return 0; } @@ -2920,16 +2937,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock, get_extent_t *get_extent) { struct inode *inode = mapping->host; + struct extent_state *cached_state = NULL; u64 start = iblock << inode->i_blkbits; sector_t sector = 0; size_t blksize = (1 << inode->i_blkbits); struct extent_map *em; - lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, - GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + 0, &cached_state, GFP_NOFS); em = get_extent(inode, NULL, 0, start, blksize, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, - GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, + start + blksize - 1, &cached_state, GFP_NOFS); if (!em || IS_ERR(em)) return 0; @@ -2951,6 +2969,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; u64 disko = 0; struct extent_map *em = NULL; + struct extent_state 
*cached_state = NULL; int end = 0; u64 em_start = 0, em_len = 0; unsigned long emflags; @@ -2959,8 +2978,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (len == 0) return -EINVAL; - lock_extent(&BTRFS_I(inode)->io_tree, start, start + len, - GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, + &cached_state, GFP_NOFS); em = get_extent(inode, NULL, 0, off, max - off, 0); if (!em) goto out; @@ -3023,8 +3042,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, out_free: free_extent_map(em); out: - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, - GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, + &cached_state, GFP_NOFS); return ret; } @@ -3264,7 +3283,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, } int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) + struct extent_buffer *eb, + struct extent_state **cached_state) { unsigned long i; struct page *page; @@ -3274,7 +3294,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) @@ -3334,7 +3354,8 @@ int extent_range_uptodate(struct extent_io_tree *tree, } int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb) + struct extent_buffer *eb, + struct extent_state *cached_state) { int ret = 0; unsigned long num_pages; @@ -3346,7 +3367,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1, NULL); + EXTENT_UPTODATE, 1, cached_state); if (ret) return ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 36de250a7b2..bbab4813646 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, struct extent_state **cached, gfp_t mask); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask); int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, @@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + struct extent_state **cached_state, gfp_t mask); int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree, int set_extent_buffer_uptodate(struct extent_io_tree *tree, struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb); + struct extent_buffer *eb, + struct extent_state **cached_state); int extent_buffer_uptodate(struct extent_io_tree *tree, - struct extent_buffer *eb); + struct extent_buffer *eb, + struct extent_state *cached_state); int 
map_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **token, char **map, unsigned long *map_start, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 428fcac45f9..28d87ba60ce 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -35,7 +35,7 @@ void extent_map_exit(void) */ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { - tree->map.rb_node = NULL; + tree->map = RB_ROOT; rwlock_init(&tree->lock); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6ed434ac037..ee3323c7fc1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -123,7 +123,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, root->sectorsize - 1) & ~((u64)root->sectorsize - 1); end_of_last_block = start_pos + num_bytes - 1; - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + NULL); if (err) return err; @@ -753,6 +754,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, loff_t pos, unsigned long first_index, unsigned long last_index, size_t write_bytes) { + struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; @@ -781,16 +783,18 @@ again: } if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, 0, &cached_state, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, last_pos - 1); if (ordered && ordered->file_offset + ordered->len > start_pos && ordered->file_offset < last_pos) { btrfs_put_ordered_extent(ordered); - unlock_extent(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, + &cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); page_cache_release(pages[i]); @@ -802,12 +806,13 @@ again: if (ordered) btrfs_put_ordered_extent(ordered); - clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, + clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, + EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, GFP_NOFS); - unlock_extent(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, &cached_state, + GFP_NOFS); } for (i = 0; i < num_pages; i++) { clear_page_dirty_for_io(pages[i]); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cb2849f0325..dd831ed31ee 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -870,7 +870,7 @@ __btrfs_return_cluster_to_free_space( tree_insert_offset(&block_group->free_space_offset, entry->offset, &entry->offset_index, 0); } - cluster->root.rb_node = NULL; + cluster->root = RB_ROOT; out: spin_unlock(&cluster->lock); @@ -1355,7 +1355,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) { spin_lock_init(&cluster->lock); spin_lock_init(&cluster->refill_lock); - cluster->root.rb_node = NULL; + cluster->root = RB_ROOT; cluster->max_size = 0; cluster->points_to_bitmap = false; INIT_LIST_HEAD(&cluster->block_group_list); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4deb280f896..02bb099845f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -379,7 +379,8 @@ again: * 
change at any time if we discover bad compression ratios. */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && - btrfs_test_opt(root, COMPRESS)) { + (btrfs_test_opt(root, COMPRESS) || + (BTRFS_I(inode)->force_compress))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -483,8 +484,10 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(root, FORCE_COMPRESS)) + if (!btrfs_test_opt(root, FORCE_COMPRESS) && + !(BTRFS_I(inode)->force_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } } if (will_compress) { *num_added += 1; @@ -570,8 +573,8 @@ retry: unsigned long nr_written = 0; lock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, GFP_NOFS); + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); /* allocate blocks */ ret = cow_file_range(inode, async_cow->locked_page, @@ -1211,7 +1214,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS)) + else if (!btrfs_test_opt(root, COMPRESS) && + !(BTRFS_I(inode)->force_compress)) ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); else @@ -1508,12 +1512,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, return 0; } -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + struct extent_state **cached_state) { if ((end & (PAGE_CACHE_SIZE - 1)) == 0) WARN_ON(1); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - GFP_NOFS); + cached_state, GFP_NOFS); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -1526,6 +1531,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; struct page *page; struct inode *inode; u64 page_start; @@ -1544,7 +1550,8 @@ again: page_start = page_offset(page); page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; - lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + &cached_state, GFP_NOFS); /* already ordered? 
We're done */ if (PagePrivate2(page)) @@ -1552,17 +1559,18 @@ again: ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { - unlock_extent(&BTRFS_I(inode)->io_tree, page_start, - page_end, GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, + page_end, &cached_state, GFP_NOFS); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); out: - unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, + &cached_state, GFP_NOFS); out_page: unlock_page(page); page_cache_release(page); @@ -1691,14 +1699,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; int compressed = 0; int ret; - ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); + ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1); if (!ret) return 0; - - ordered_extent = btrfs_lookup_ordered_extent(inode, start); BUG_ON(!ordered_extent); if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { @@ -1713,9 +1721,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) goto out; } - lock_extent(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - GFP_NOFS); + lock_extent_bits(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + 0, &cached_state, GFP_NOFS); trans = btrfs_join_transaction(root, 1); @@ -1742,9 +1750,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->len); BUG_ON(ret); } - unlock_extent(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - GFP_NOFS); + unlock_extent_cached(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, &cached_state, GFP_NOFS); + add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); @@ -2153,7 +2162,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &found_key, root); + inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); if (IS_ERR(inode)) break; @@ -3081,6 +3090,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; char *kaddr; u32 blocksize = root->sectorsize; pgoff_t index = from >> PAGE_CACHE_SHIFT; @@ -3127,12 +3137,14 @@ again: } wait_on_page_writeback(page); - lock_extent(io_tree, page_start, page_end, GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); unlock_page(page); page_cache_release(page); 
btrfs_start_ordered_extent(inode, ordered, 1); @@ -3140,13 +3152,15 @@ again: goto again; } - clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - GFP_NOFS); + 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); if (ret) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); goto out_unlock; } @@ -3159,7 +3173,8 @@ again: } ClearPageChecked(page); set_page_dirty(page); - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, + GFP_NOFS); out_unlock: if (ret) @@ -3177,6 +3192,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; + struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; u64 hole_start = (inode->i_size + mask) & ~mask; u64 block_end = (size + mask) & ~mask; @@ -3192,11 +3208,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct btrfs_ordered_extent *ordered; btrfs_wait_ordered_range(inode, hole_start, block_end - hole_start); - lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + &cached_state, GFP_NOFS); ordered = btrfs_lookup_ordered_extent(inode, hole_start); if (!ordered) break; - unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + unlock_extent_cached(io_tree, hole_start, block_end - 1, + &cached_state, GFP_NOFS); btrfs_put_ordered_extent(ordered); } @@ -3241,7 +3259,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) break; } - unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, + GFP_NOFS); return err; } @@ -3639,6 +3658,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->index_cnt = (u64)-1; bi->last_unlink_trans = 0; bi->ordered_data_close = 0; + bi->force_compress = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping, GFP_NOFS); @@ -3687,7 +3707,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, * Returns in *is_new if the inode was read from disk */ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root) + struct btrfs_root *root, int *new) { struct inode *inode; @@ -3702,6 +3722,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, inode_tree_add(inode); unlock_new_inode(inode); + if (new) + *new = 1; } return inode; @@ -3754,7 +3776,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return NULL; if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, &location, root); + inode = btrfs_iget(dir->i_sb, &location, root, NULL); return inode; } @@ -3769,7 +3791,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir->i_sb, &location, sub_root); } else { - inode = btrfs_iget(dir->i_sb, &location, sub_root); + inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); } srcu_read_unlock(&root->fs_info->subvol_srcu, index); @@ -3968,7 +3990,7 @@ 
err: return ret; } -int btrfs_write_inode(struct inode *inode, int wait) +int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -3977,7 +3999,7 @@ int btrfs_write_inode(struct inode *inode, int wait) if (root->fs_info->btree_inode == inode) return 0; - if (wait) { + if (wbc->sync_mode == WB_SYNC_ALL) { trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_commit_transaction(trans, root); @@ -4501,7 +4523,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); if (err) { err = -ENOSPC; - goto out_unlock; + goto out_fail; } inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, @@ -4979,6 +5001,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) { struct extent_io_tree *tree; struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -4997,7 +5020,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent(tree, page_start, page_end, GFP_NOFS); + lock_extent_bits(tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); if (ordered) { @@ -5008,7 +5032,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, - NULL, GFP_NOFS); + &cached_state, GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -5018,11 +5042,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) page_start, page_end); } btrfs_put_ordered_extent(ordered); - lock_extent(tree, page_start, page_end, GFP_NOFS); + cached_state = NULL; + lock_extent_bits(tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); + EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -5055,6 +5081,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; char *kaddr; unsigned long zero_start; loff_t size; @@ -5093,7 +5120,8 @@ again: } wait_on_page_writeback(page); - lock_extent(io_tree, page_start, page_end, GFP_NOFS); + lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state, + GFP_NOFS); set_page_extent_mapped(page); /* @@ -5102,7 +5130,8 @@ again: */ ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); @@ -5116,13 +5145,15 @@ again: * is probably a better way to do this, but for now keep consistent with * prepare_pages in the normal write path. 
*/ - clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - GFP_NOFS); + 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, + &cached_state); if (ret) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, + &cached_state, GFP_NOFS); ret = VM_FAULT_SIGBUS; btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; @@ -5148,7 +5179,7 @@ again: BTRFS_I(inode)->last_trans = root->fs_info->generation; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: btrfs_unreserve_metadata_for_delalloc(root, inode, 1); @@ -5827,6 +5858,7 @@ stop_trans: static long btrfs_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { + struct extent_state *cached_state = NULL; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -5865,16 +5897,17 @@ static long btrfs_fallocate(struct inode *inode, int mode, /* the extent lock is ordered inside the running * transaction */ - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && ordered->file_offset + ordered->len > alloc_start && ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); - unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); /* * we can't wait on the range with the transaction * running or with the extent lock held @@ -5916,8 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, alloc_end - alloc_start); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 645a17927a8..2845c6ceecd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -48,6 +48,7 @@ #include "print-tree.h" #include "volumes.h" #include "locking.h" +#include "ctree.h" /* Mask out flags that are inappropriate for the given type of inode. 
*/ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -474,7 +475,79 @@ out_unlock: return error; } -static int btrfs_defrag_file(struct file *file) +static int should_defrag_range(struct inode *inode, u64 start, u64 len, + int thresh, u64 *last_len, u64 *skip, + u64 *defrag_end) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 1; + + + if (thresh == 0) + thresh = 256 * 1024; + + /* + * make sure that once we start defragging an extent, we keep on + * defragging it + */ + if (start < *defrag_end) + return 1; + + *skip = 0; + + /* + * hopefully we have this extent in the tree already, try without + * the full extent lock + */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + read_unlock(&em_tree->lock); + + if (!em) { + /* get the big lock and read metadata off disk */ + lock_extent(io_tree, start, start + len - 1, GFP_NOFS); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + unlock_extent(io_tree, start, start + len - 1, GFP_NOFS); + + if (!em) + return 0; + } + + /* this will cover holes, and inline extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) + ret = 0; + + /* + * we hit a real extent, if it is big don't bother defragging it again + */ + if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) + ret = 0; + + /* + * last_len ends up being a counter of how many bytes we've defragged. + * every time we choose not to defrag an extent, we reset *last_len + * so that the next tiny extent will force a defrag. + * + * The end result of this is that tiny extents before a single big + * extent will force at least part of that big extent to be defragged. + */ + if (ret) { + *last_len += len; + *defrag_end = extent_map_end(em); + } else { + *last_len = 0; + *skip = extent_map_end(em); + *defrag_end = 0; + } + + free_extent_map(em); + return ret; +} + +static int btrfs_defrag_file(struct file *file, + struct btrfs_ioctl_defrag_range_args *range) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file) unsigned long total_read = 0; u64 page_start; u64 page_end; + u64 last_len = 0; + u64 skip = 0; + u64 defrag_end = 0; unsigned long i; int ret; - ret = btrfs_check_data_free_space(root, inode, inode->i_size); - if (ret) - return -ENOSPC; + if (inode->i_size == 0) + return 0; + + if (range->start + range->len > range->start) { + last_index = min_t(u64, inode->i_size - 1, + range->start + range->len - 1) >> PAGE_CACHE_SHIFT; + } else { + last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + } + + i = range->start >> PAGE_CACHE_SHIFT; + while (i <= last_index) { + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, + range->extent_thresh, + &last_len, &skip, + &defrag_end)) { + unsigned long next; + /* + * the should_defrag function tells us how much to skip + * bump our counter by the suggested amount + */ + next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + i = max(i + 1, next); + continue; + } - mutex_lock(&inode->i_mutex); - last_index = inode->i_size >> PAGE_CACHE_SHIFT; - for (i = 0; i <= last_index; i++) { if (total_read % ra_pages == 0) { btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, min(last_index, i + ra_pages - 1)); } total_read++; + mutex_lock(&inode->i_mutex); + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) + BTRFS_I(inode)->force_compress
= 1; + + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) { + mutex_unlock(&inode->i_mutex); + ret = -ENOSPC; + break; + } + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) { + btrfs_free_reserved_data_space(root, inode, + PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); + ret = -ENOSPC; + break; + } again: + if (inode->i_size == 0 || + i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { + ret = 0; + goto err_reservations; + } + page = grab_cache_page(inode->i_mapping, i); - if (!page) - goto out_unlock; + if (!page) { + ret = -ENOMEM; + goto err_reservations; + } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); - goto out_unlock; + goto err_reservations; } } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + wait_on_page_writeback(page); + if (PageDirty(page)) { + btrfs_free_reserved_data_space(root, inode, + PAGE_CACHE_SIZE); + goto loop_unlock; + } + page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; lock_extent(io_tree, page_start, page_end, GFP_NOFS); @@ -537,18 +669,54 @@ again: * page if it is dirtied again later */ clear_page_dirty_for_io(page); + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, + page_end, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, GFP_NOFS); - btrfs_set_extent_delalloc(inode, page_start, page_end); + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + ClearPageChecked(page); set_page_dirty(page); unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +loop_unlock: unlock_page(page); page_cache_release(page); + mutex_unlock(&inode->i_mutex); + + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + i++; + } + + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + filemap_flush(inode->i_mapping); + + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->force_compress = 0; + mutex_unlock(&inode->i_mutex); } -out_unlock: - mutex_unlock(&inode->i_mutex); return 0; + +err_reservations: + mutex_unlock(&inode->i_mutex); + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + return ret; } static noinline int btrfs_ioctl_resize(struct btrfs_root *root, @@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, mod = 1; sizestr++; } - new_size = btrfs_parse_size(sizestr); + new_size = memparse(sizestr, NULL); if (new_size == 0) { ret = -EINVAL; goto out_unlock; @@ -743,6 +911,327 @@ out: return ret; } +static noinline int key_in_sk(struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk) +{ + struct btrfs_key test; + int ret; + + test.objectid = sk->min_objectid; + test.type = sk->min_type; + test.offset = sk->min_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret < 0) + return 0; + + test.objectid = sk->max_objectid; +
test.type = sk->max_type; + test.offset = sk->max_offset; + + ret = btrfs_comp_cpu_keys(key, &test); + if (ret > 0) + return 0; + return 1; +} + +static noinline int copy_to_sk(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_ioctl_search_key *sk, + char *buf, + unsigned long *sk_offset, + int *num_found) +{ + u64 found_transid; + struct extent_buffer *leaf; + struct btrfs_ioctl_search_header sh; + unsigned long item_off; + unsigned long item_len; + int nritems; + int i; + int slot; + int found = 0; + int ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + + if (btrfs_header_generation(leaf) > sk->max_transid) { + i = nritems; + goto advance_key; + } + found_transid = btrfs_header_generation(leaf); + + for (i = slot; i < nritems; i++) { + item_off = btrfs_item_ptr_offset(leaf, i); + item_len = btrfs_item_size_nr(leaf, i); + + if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + item_len = 0; + + if (sizeof(sh) + item_len + *sk_offset > + BTRFS_SEARCH_ARGS_BUFSIZE) { + ret = 1; + goto overflow; + } + + btrfs_item_key_to_cpu(leaf, key, i); + if (!key_in_sk(key, sk)) + continue; + + sh.objectid = key->objectid; + sh.offset = key->offset; + sh.type = key->type; + sh.len = item_len; + sh.transid = found_transid; + + /* copy search result header */ + memcpy(buf + *sk_offset, &sh, sizeof(sh)); + *sk_offset += sizeof(sh); + + if (item_len) { + char *p = buf + *sk_offset; + /* copy the item */ + read_extent_buffer(leaf, p, + item_off, item_len); + *sk_offset += item_len; + } + found++; + + if (*num_found >= sk->nr_items) + break; + } +advance_key: + ret = 0; + if (key->offset < (u64)-1 && key->offset < sk->max_offset) + key->offset++; + else if (key->type < (u8)-1 && key->type < sk->max_type) { + key->offset = 0; + key->type++; + } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) { + key->offset = 0; + key->type = 0; + key->objectid++; + } else + ret = 1; +overflow: + *num_found += found; + return ret; +} + +static noinline int search_ioctl(struct inode *inode, + struct btrfs_ioctl_search_args *args) +{ + struct btrfs_root *root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path; + struct btrfs_ioctl_search_key *sk = &args->key; + struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; + int ret; + int num_found = 0; + unsigned long sk_offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (sk->tree_id == 0) { + /* search the root of the inode that was passed */ + root = BTRFS_I(inode)->root; + } else { + key.objectid = sk->tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "could not find root %llu\n", + sk->tree_id); + btrfs_free_path(path); + return -ENOENT; + } + } + + key.objectid = sk->min_objectid; + key.type = sk->min_type; + key.offset = sk->min_offset; + + max_key.objectid = sk->max_objectid; + max_key.type = sk->max_type; + max_key.offset = sk->max_offset; + + path->keep_locks = 1; + + while(1) { + ret = btrfs_search_forward(root, &key, &max_key, path, 0, + sk->min_transid); + if (ret != 0) { + if (ret > 0) + ret = 0; + goto err; + } + ret = copy_to_sk(root, path, &key, sk, args->buf, + &sk_offset, &num_found); + btrfs_release_path(root, path); + if (ret || num_found >= sk->nr_items) + break; + + } + ret = 0; +err: + sk->nr_items = num_found; + btrfs_free_path(path); + return ret; +} + +static noinline int 
btrfs_ioctl_tree_search(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args *args; + struct inode *inode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + if (copy_from_user(args, argp, sizeof(*args))) { + kfree(args); + return -EFAULT; + } + inode = fdentry(file)->d_inode; + ret = search_ioctl(inode, args); + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + kfree(args); + return ret; +} + +/* + * Search INODE_REFs to identify the path name of the 'dirid' directory + * in a 'tree_id' tree, and set the path name in 'name'. + */ +static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, + u64 tree_id, u64 dirid, char *name) +{ + struct btrfs_root *root; + struct btrfs_key key; + char *ptr; + int ret = -1; + int slot; + int len; + int total_len = 0; + struct btrfs_inode_ref *iref; + struct extent_buffer *l; + struct btrfs_path *path; + + if (dirid == BTRFS_FIRST_FREE_OBJECTID) { + name[0]='\0'; + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; + + key.objectid = tree_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(info, &key); + if (IS_ERR(root)) { + printk(KERN_ERR "could not find root %llu\n", tree_id); + ret = -ENOENT; + goto out; + } + + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + while(1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + l = path->nodes[0]; + slot = path->slots[0]; + if (ret > 0 && slot > 0) + slot--; + btrfs_item_key_to_cpu(l, &key, slot); + + if (ret > 0 && (key.objectid != dirid || + key.type != BTRFS_INODE_REF_KEY)) { + ret = -ENOENT; + goto out; + } + + iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(l, iref); + ptr -= len + 1; + total_len += len + 1; + if (ptr < name) + goto out; + + *(ptr + len) = '/'; + read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); + + if (key.offset == BTRFS_FIRST_FREE_OBJECTID) + break; + + btrfs_release_path(root, path); + key.objectid = key.offset; + key.offset = (u64)-1; + dirid = key.objectid; + + } + if (ptr < name) + goto out; + memcpy(name, ptr, total_len); + name[total_len]='\0'; + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_ino_lookup(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_ino_lookup_args *args; + struct inode *inode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + if (copy_from_user(args, argp, sizeof(*args))) { + kfree(args); + return -EFAULT; + } + inode = fdentry(file)->d_inode; + + if (args->treeid == 0) + args->treeid = BTRFS_I(inode)->root->root_key.objectid; + + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, + args->treeid, args->objectid, + args->name); + + if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = -EFAULT; + + kfree(args); + return ret; +} + static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg) { @@ -849,10 +1338,11 @@ out: return err; } -static int btrfs_ioctl_defrag(struct file *file) +static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_defrag_range_args *range; int ret; ret =
mnt_want_write(file->f_path.mnt); @@ -873,7 +1363,30 @@ static int btrfs_ioctl_defrag(struct file *file) ret = -EINVAL; goto out; } - btrfs_defrag_file(file); + + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + ret = -ENOMEM; + goto out; + } + + if (argp) { + if (copy_from_user(range, argp, + sizeof(*range))) { + ret = -EFAULT; + kfree(range); + goto out; + } + /* compression requires us to start the IO */ + if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range->flags |= BTRFS_DEFRAG_RANGE_START_IO; + range->extent_thresh = (u32)-1; + } + } else { + /* the rest are all set to zero by kzalloc */ + range->len = (u64)-1; + } + btrfs_defrag_file(file, range); + kfree(range); break; } out: @@ -1274,6 +1787,157 @@ out: return ret; } +static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_super_block *disk_super; + u64 features; + u64 objectid = 0; + u64 dir_id; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&objectid, argp, sizeof(objectid))) + return -EFAULT; + + if (!objectid) + objectid = root->root_key.objectid; + + location.objectid = objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + if (IS_ERR(new_root)) + return PTR_ERR(new_root); + + if (btrfs_root_refs(&new_root->root_item) == 0) + return -ENOENT; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + btrfs_free_path(path); + return -ENOMEM; + } + + dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, + dir_id, "default", 7, 1); + if (!di) { + btrfs_free_path(path); + btrfs_end_transaction(trans, root); + printk(KERN_ERR "Umm, you don't have the default dir item, " + "this isn't going to work\n"); + return -ENOENT; + } + + btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); + btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { + features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; + btrfs_set_super_incompat_flags(disk_super, features); + } + btrfs_end_transaction(trans, root); + + return 0; +} + +long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_info space; + struct btrfs_ioctl_space_info *dest; + struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info *user_dest; + struct btrfs_space_info *info; + int alloc_size; + int ret = 0; + int slot_count = 0; + + if (copy_from_user(&space_args, + (struct btrfs_ioctl_space_args __user *)arg, + sizeof(space_args))) + return -EFAULT; + + /* first we count slots */ + rcu_read_lock(); + list_for_each_entry_rcu(info, &root->fs_info->space_info, list) + slot_count++; + rcu_read_unlock(); + + /* space_slots == 0 means they are asking for a count */ + if (space_args.space_slots == 0) { + space_args.total_spaces = slot_count; + goto out; + } +
alloc_size = sizeof(*dest) * slot_count; + /* we generally have at most 6 or so space infos, one for each raid + * level. So, a whole page should be more than enough for everyone + */ + if (alloc_size > PAGE_CACHE_SIZE) + return -ENOMEM; + + space_args.total_spaces = 0; + dest = kmalloc(alloc_size, GFP_NOFS); + if (!dest) + return -ENOMEM; + dest_orig = dest; + + /* now we have a buffer to copy into */ + rcu_read_lock(); + list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { + /* make sure we don't copy more than we allocated + * in our buffer + */ + if (slot_count == 0) + break; + slot_count--; + + /* make sure userland has enough room in their buffer */ + if (space_args.total_spaces >= space_args.space_slots) + break; + + space.flags = info->flags; + space.total_bytes = info->total_bytes; + space.used_bytes = info->bytes_used; + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + } + rcu_read_unlock(); + + user_dest = (struct btrfs_ioctl_space_info *) + (arg + sizeof(struct btrfs_ioctl_space_args)); + + if (copy_to_user(user_dest, dest_orig, alloc_size)) + ret = -EFAULT; + + kfree(dest_orig); +out: + if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) + ret = -EFAULT; + + return ret; +} + /* * there are many ways the trans_start and trans_end ioctls can lead * to deadlocks. They should only be used by applications that @@ -1320,8 +1984,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_DEFAULT_SUBVOL: + return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: - return btrfs_ioctl_defrag(file); + return btrfs_ioctl_defrag(file, NULL); + case BTRFS_IOC_DEFRAG_RANGE: + return btrfs_ioctl_defrag(file, argp); case BTRFS_IOC_RESIZE: return btrfs_ioctl_resize(root, argp); case BTRFS_IOC_ADD_DEV: @@ -1338,6 +2006,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_TREE_SEARCH: + return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_INO_LOOKUP: + return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_SPACE_INFO: + return btrfs_ioctl_space_info(root, argp); case BTRFS_IOC_SYNC: btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index bc49914475e..424694aa517 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args { char name[BTRFS_PATH_NAME_MAX + 1]; }; +#define BTRFS_INO_LOOKUP_PATH_MAX 4080 +struct btrfs_ioctl_ino_lookup_args { + __u64 treeid; + __u64 objectid; + char name[BTRFS_INO_LOOKUP_PATH_MAX]; +}; + +struct btrfs_ioctl_search_key { + /* which root are we searching. 
0 means the root of the inode this ioctl is called on */ + __u64 tree_id; + + /* keys returned will be >= min and <= max */ + __u64 min_objectid; + __u64 max_objectid; + + /* keys returned will be >= min and <= max */ + __u64 min_offset; + __u64 max_offset; + + /* max and min transids to search for */ + __u64 min_transid; + __u64 max_transid; + + /* keys returned will be >= min and <= max */ + __u32 min_type; + __u32 max_type; + + /* + * how many items did userland ask for, and how many are we + * returning + */ + __u32 nr_items; + + /* align to 64 bits */ + __u32 unused; + + /* some extra for later */ + __u64 unused1; + __u64 unused2; + __u64 unused3; + __u64 unused4; +}; + +struct btrfs_ioctl_search_header { + __u64 transid; + __u64 objectid; + __u64 offset; + __u32 type; + __u32 len; +}; + +#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) +/* + * the buf is an array of search headers where + * each header is followed by the actual item. + * the type field is expanded to 32 bits for alignment. + */ +struct btrfs_ioctl_search_args { + struct btrfs_ioctl_search_key key; + char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; +}; + struct btrfs_ioctl_clone_range_args { __s64 src_fd; __u64 src_offset, src_length; __u64 dest_offset; }; +/* flags for the defrag range ioctl */ +#define BTRFS_DEFRAG_RANGE_COMPRESS 1 +#define BTRFS_DEFRAG_RANGE_START_IO 2 + +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. Use 0 to take the kernel default. + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* spare for later */ + __u32 unused[5]; +}; + +struct btrfs_ioctl_space_info { + __u64 flags; + __u64 total_bytes; + __u64 used_bytes; +}; + +struct btrfs_ioctl_space_args { + __u64 space_slots; + __u64 total_spaces; + struct btrfs_ioctl_space_info spaces[0]; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ + struct btrfs_ioctl_defrag_range_args) +#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ + struct btrfs_ioctl_search_args) +#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ + struct btrfs_ioctl_ino_lookup_args) +#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, __u64) +#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ + struct btrfs_ioctl_space_args) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5c2a9e78a94..a8ffecd0b49 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -174,7 +174,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, if (!entry) return -ENOMEM; - mutex_lock(&tree->mutex); entry->file_offset = file_offset; entry->start = start; entry->len = len; @@ -190,16 +189,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); + spin_lock(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); BUG_ON(node); +
spin_unlock(&tree->lock); spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, &BTRFS_I(inode)->root->fs_info->ordered_extents); spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); - mutex_unlock(&tree->mutex); BUG_ON(node); return 0; } @@ -216,9 +216,9 @@ int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_inode_tree *tree; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); + spin_lock(&tree->lock); list_add_tail(&sum->list, &entry->list); - mutex_unlock(&tree->mutex); + spin_unlock(&tree->lock); return 0; } @@ -232,15 +232,16 @@ int btrfs_add_ordered_sum(struct inode *inode, * to make sure this function only returns 1 once for a given ordered extent. */ int btrfs_dec_test_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; - struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *entry = NULL; int ret; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); + spin_lock(&tree->lock); node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -264,7 +265,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, else ret = 1; out: - mutex_unlock(&tree->mutex); + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); return ret == 0; } @@ -291,7 +296,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * and you must wake_up entry->wait. You must hold the tree mutex + * and you must wake_up entry->wait. You must hold the tree lock * while you call this function. */ static int __btrfs_remove_ordered_extent(struct inode *inode, @@ -340,9 +345,9 @@ int btrfs_remove_ordered_extent(struct inode *inode, int ret; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); + spin_lock(&tree->lock); ret = __btrfs_remove_ordered_extent(inode, entry); - mutex_unlock(&tree->mutex); + spin_unlock(&tree->lock); wake_up(&entry->wait); return ret; @@ -567,7 +572,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); + spin_lock(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -578,7 +583,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (entry) atomic_inc(&entry->refs); out: - mutex_unlock(&tree->mutex); + spin_unlock(&tree->lock); return entry; } @@ -594,7 +599,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) struct btrfs_ordered_extent *entry = NULL; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); + spin_lock(&tree->lock); node = tree_search(tree, file_offset); if (!node) goto out; @@ -602,7 +607,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); atomic_inc(&entry->refs); out: - mutex_unlock(&tree->mutex); + spin_unlock(&tree->lock); return entry; } @@ -629,7 +634,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, else offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - mutex_lock(&tree->mutex); + spin_lock(&tree->lock); disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ @@ -735,7 +740,7 @@ out: */ if (ordered) __btrfs_remove_ordered_extent(inode, 
ordered); - mutex_unlock(&tree->mutex); + spin_unlock(&tree->lock); if (ordered) wake_up(&ordered->wait); return ret; @@ -762,7 +767,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, if (!ordered) return 1; - mutex_lock(&tree->mutex); + spin_lock(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr) { num_sectors = ordered_sum->len / sectorsize; @@ -777,7 +782,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, } } out: - mutex_unlock(&tree->mutex); + spin_unlock(&tree->lock); btrfs_put_ordered_extent(ordered); return ret; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 1fe1282ef47..c82f76a9f04 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -21,7 +21,7 @@ /* one of these per inode */ struct btrfs_ordered_inode_tree { - struct mutex mutex; + spinlock_t lock; struct rb_root tree; struct rb_node *last; }; @@ -128,8 +128,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root, static inline void btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) { - mutex_init(&t->mutex); - t->tree.rb_node = NULL; + spin_lock_init(&t->lock); + t->tree = RB_ROOT; t->last = NULL; } @@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); int btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct inode *inode, - u64 file_offset, u64 io_size); + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_sum(struct inode *inode, diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h index bc283ad2db7..e2a55cb2072 100644 --- a/fs/btrfs/ref-cache.h +++ b/fs/btrfs/ref-cache.h @@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents) static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) { - tree->root.rb_node = NULL; + tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->list); spin_lock_init(&tree->lock); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ab7ab531874..0b23942cbc0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -170,14 +170,14 @@ struct async_merge { static void mapping_tree_init(struct mapping_tree *tree) { - tree->rb_root.rb_node = NULL; + tree->rb_root = RB_ROOT; spin_lock_init(&tree->lock); } static void backref_cache_init(struct backref_cache *cache) { int i; - cache->rb_root.rb_node = NULL; + cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); spin_lock_init(&cache->lock); @@ -2659,7 +2659,7 @@ static int relocate_file_extent_cluster(struct inode *inode, EXTENT_BOUNDARY, GFP_NOFS); nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); set_page_dirty(page); dirty_page++; @@ -3487,7 +3487,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root); + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); BTRFS_I(inode)->index_cnt = group->key.objectid; diff --git a/fs/btrfs/super.c index 8a1ea6e6457..9ac612e6ca6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c
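
The TREE_SEARCH ioctl wired up in ioctl.c above copies results back as a packed run of struct btrfs_ioctl_search_header records, each immediately followed by sh.len bytes of raw item, and search_ioctl() rewrites key.nr_items to the number of records actually placed in buf. A minimal userspace sketch of that decode loop (not part of the patch; it assumes the ioctl.h above is installed as <btrfs/ioctl.h>, and the helper name and the raw constants 1 for the tree of tree roots and 132 for BTRFS_ROOT_ITEM_KEY are illustrative):

/*
 * List subvolume root items via BTRFS_IOC_TREE_SEARCH.  fd is any open
 * file or directory on the mounted filesystem; the handler above
 * requires CAP_SYS_ADMIN.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <btrfs/ioctl.h>		/* assumed install path of the header above */

static int dump_subvol_roots(int fd)
{
	struct btrfs_ioctl_search_args args;
	struct btrfs_ioctl_search_header sh;
	unsigned long off = 0;
	__u32 i;

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 1;			/* tree of tree roots */
	args.key.min_objectid = 0;
	args.key.max_objectid = (__u64)-1;
	args.key.min_type = 132;		/* BTRFS_ROOT_ITEM_KEY */
	args.key.max_type = 132;
	args.key.min_offset = 0;
	args.key.max_offset = (__u64)-1;
	args.key.min_transid = 0;
	args.key.max_transid = (__u64)-1;
	args.key.nr_items = ~0U;		/* as many as fit in buf */

	if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
		return -1;

	/* the kernel rewrote nr_items to the number of headers in buf */
	for (i = 0; i < args.key.nr_items; i++) {
		memcpy(&sh, args.buf + off, sizeof(sh));	/* may be unaligned */
		off += sizeof(sh) + sh.len;			/* header + item body */
		printf("root %llu generation %llu item len %u\n",
		       (unsigned long long)sh.objectid,
		       (unsigned long long)sh.transid, sh.len);
	}
	return 0;
}

A caller walking a range larger than BTRFS_SEARCH_ARGS_BUFSIZE would reissue the ioctl with the min_objectid/min_type/min_offset triple advanced past the last header returned, mirroring the advance_key logic in copy_to_sk().
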
@@ -63,10 +63,10 @@ static void btrfs_put_super(struct super_block *sb) } enum { - Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, - Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, + Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, + Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, + Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, + Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_err, }; @@ -74,6 +74,7 @@ enum { static match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, + {Opt_subvolid, "subvolid=%d"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, {Opt_nodatacow, "nodatacow"}, @@ -95,31 +96,6 @@ static match_table_t tokens = { {Opt_err, NULL}, }; -u64 btrfs_parse_size(char *str) -{ - u64 res; - int mult = 1; - char *end; - char last; - - res = simple_strtoul(str, &end, 10); - - last = end[0]; - if (isalpha(last)) { - last = tolower(last); - switch (last) { - case 'g': - mult *= 1024; - case 'm': - mult *= 1024; - case 'k': - mult *= 1024; - } - res = res * mult; - } - return res; -} - /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -128,7 +104,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num; + char *p, *num, *orig; int intarg; int ret = 0; @@ -143,6 +119,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) if (!options) return -ENOMEM; + orig = options; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -156,6 +133,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: + case Opt_subvolid: case Opt_device: /* * These are parsed by btrfs_parse_early_options @@ -213,7 +191,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_max_extent: num = match_strdup(&args[0]); if (num) { - info->max_extent = btrfs_parse_size(num); + info->max_extent = memparse(num, NULL); kfree(num); info->max_extent = max_t(u64, @@ -225,7 +203,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_max_inline: num = match_strdup(&args[0]); if (num) { - info->max_inline = btrfs_parse_size(num); + info->max_inline = memparse(num, NULL); kfree(num); if (info->max_inline) { @@ -240,7 +218,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_alloc_start: num = match_strdup(&args[0]); if (num) { - info->alloc_start = btrfs_parse_size(num); + info->alloc_start = memparse(num, NULL); kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", @@ -280,7 +258,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: - kfree(options); + kfree(orig); return ret; } @@ -291,12 +269,13 @@ out: * only when we need to allocate a new super block. 
*/ static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, char **subvol_name, + void *holder, char **subvol_name, u64 *subvol_objectid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *opts, *p; int error = 0; + int intarg; if (!options) goto out; @@ -319,6 +298,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, case Opt_subvol: *subvol_name = match_strdup(&args[0]); break; + case Opt_subvolid: + intarg = 0; + error = match_int(&args[0], &intarg); + if (!error) { + /* we want the original fs_tree */ + if (!intarg) + *subvol_objectid = + BTRFS_FS_TREE_OBJECTID; + else + *subvol_objectid = intarg; + } + break; case Opt_device: error = btrfs_scan_one_device(match_strdup(&args[0]), flags, holder, fs_devices); @@ -346,6 +337,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, return error; } +static struct dentry *get_default_root(struct super_block *sb, + u64 subvol_objectid) +{ + struct btrfs_root *root = sb->s_fs_info; + struct btrfs_root *new_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + struct inode *inode; + struct dentry *dentry; + u64 dir_id; + int new = 0; + + /* + * We have a specific subvol we want to mount, just setup location and + * go look up the root. + */ + if (subvol_objectid) { + location.objectid = subvol_objectid; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + goto find_root; + } + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + path->leave_spinning = 1; + + /* + * Find the "default" dir item which points to the root item that we + * will mount by default if we haven't been given a specific subvolume + * to mount. + */ + dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + if (!di) { + /* + * Ok the default dir item isn't there. This is weird since + * it's always been there, but don't freak out, just try and + * mount to root most subvolume. + */ + btrfs_free_path(path); + dir_id = BTRFS_FIRST_FREE_OBJECTID; + new_root = root->fs_info->fs_root; + goto setup_root; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + btrfs_free_path(path); + +find_root: + new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + if (IS_ERR(new_root)) + return ERR_PTR(PTR_ERR(new_root)); + + if (btrfs_root_refs(&new_root->root_item) == 0) + return ERR_PTR(-ENOENT); + + dir_id = btrfs_root_dirid(&new_root->root_item); +setup_root: + location.objectid = dir_id; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + inode = btrfs_iget(sb, &location, new_root, &new); + if (!inode) + return ERR_PTR(-ENOMEM); + + /* + * If we're just mounting the root most subvol put the inode and return + * a reference to the dentry. We will have already gotten a reference + * to the inode in btrfs_fill_super so we're good to go. + */ + if (!new && sb->s_root->d_inode == inode) { + iput(inode); + return dget(sb->s_root); + } + + if (new) { + const struct qstr name = { .name = "/", .len = 1 }; + + /* + * New inode, we need to make the dentry a sibling of s_root so + * everything gets cleaned up properly on unmount. + */ + dentry = d_alloc(sb->s_root, &name); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + d_splice_alias(inode, dentry); + } else { + /* + * We found the inode in cache, just find a dentry for it and + * put the reference to the inode we just got. 
+ */ + dentry = d_find_alias(inode); + iput(inode); + } + + return dentry; +} + static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices, void *data, int silent) @@ -379,7 +474,7 @@ static int btrfs_fill_super(struct super_block *sb, key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -391,12 +486,6 @@ static int btrfs_fill_super(struct super_block *sb, err = -ENOMEM; goto fail_close; } -#if 0 - /* this does the super kobj at the same time */ - err = btrfs_sysfs_add_super(tree_root->fs_info); - if (err) - goto fail_close; -#endif sb->s_root = root_dentry; @@ -488,19 +577,22 @@ static int btrfs_test_super(struct super_block *s, void *data) static int btrfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - char *subvol_name = NULL; struct block_device *bdev = NULL; struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; int error = 0; + int found = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, - &subvol_name, &fs_devices); + &subvol_name, &subvol_objectid, + &fs_devices); if (error) return error; @@ -529,6 +621,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, goto error_close_devices; } + found = 1; btrfs_close_devices(fs_devices); } else { char b[BDEVNAME_SIZE]; @@ -546,25 +639,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; } - if (!strcmp(subvol_name, ".")) - root = dget(s->s_root); - else { - mutex_lock(&s->s_root->d_inode->i_mutex); - root = lookup_one_len(subvol_name, s->s_root, + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error; + } + /* if they gave us a subvolume name bind mount into that */ + if (strcmp(subvol_name, ".")) { + struct dentry *new_root; + mutex_lock(&root->d_inode->i_mutex); + new_root = lookup_one_len(subvol_name, root, strlen(subvol_name)); - mutex_unlock(&s->s_root->d_inode->i_mutex); + mutex_unlock(&root->d_inode->i_mutex); - if (IS_ERR(root)) { + if (IS_ERR(new_root)) { deactivate_locked_super(s); - error = PTR_ERR(root); - goto error_free_subvol_name; + error = PTR_ERR(new_root); + dput(root); + goto error_close_devices; } - if (!root->d_inode) { + if (!new_root->d_inode) { dput(root); + dput(new_root); deactivate_locked_super(s); error = -ENXIO; - goto error_free_subvol_name; + goto error_close_devices; } + dput(root); + root = new_root; } mnt->mnt_sb = s; @@ -579,6 +682,7 @@ error_close_devices: btrfs_close_devices(fs_devices); error_free_subvol_name: kfree(subvol_name); +error: return error; } @@ -623,14 +727,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct list_head *head = &root->fs_info->space_info; + struct btrfs_space_info *found; + u64 total_used = 0; + u64 data_used = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & 
(BTRFS_BLOCK_GROUP_DUP| + BTRFS_BLOCK_GROUP_RAID10| + BTRFS_BLOCK_GROUP_RAID1)) { + total_used += found->bytes_used; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + data_used += found->bytes_used; + else + data_used += found->total_bytes; + } + + total_used += found->bytes_used; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + data_used += found->bytes_used; + else + data_used += found->total_bytes; + } + rcu_read_unlock(); + buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; - buf->f_bfree = buf->f_blocks - - (btrfs_super_bytes_used(disk_super) >> bits); - buf->f_bavail = buf->f_bfree; + buf->f_bfree = buf->f_blocks - (total_used >> bits); + buf->f_bavail = buf->f_blocks - (data_used >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a240b6fa81d..4ce16ef702a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj) complete(&root->kobj_unregister); } -static struct sysfs_ops btrfs_super_attr_ops = { +static const struct sysfs_ops btrfs_super_attr_ops = { .show = btrfs_super_attr_show, .store = btrfs_super_attr_store, }; -static struct sysfs_ops btrfs_root_attr_ops = { +static const struct sysfs_ops btrfs_root_attr_ops = { .show = btrfs_root_attr_show, .store = btrfs_root_attr_store, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b2acc79f1b3..2d654c1c794 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -69,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root) cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); - cur_trans->delayed_refs.root.rb_node = NULL; + cur_trans->delayed_refs.root = RB_ROOT; cur_trans->delayed_refs.num_entries = 0; cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.num_heads = 0; @@ -997,13 +997,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (flush_on_commit) { + if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); - } else if (snap_pending) { - ret = btrfs_wait_ordered_extents(root, 0, 1); - BUG_ON(ret); } /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4a9434b622e..1255fcc8ade 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -445,7 +445,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root); + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); if (IS_ERR(inode)) { inode = NULL; } else if (is_bad_inode(inode)) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 41ecbb2347f..9df8e3f1cca 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -256,13 +256,13 @@ loop_lock: wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - submit_bio(cur->bi_rw, cur); - num_run++; - batch_run++; if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) num_sync_run++; + submit_bio(cur->bi_rw, cur); + num_run++; + batch_run++; if (need_resched()) { if (num_sync_run) { blk_run_backing_dev(bdi, NULL); @@ -325,16 +325,6 @@ loop_lock: num_sync_run = 0; blk_run_backing_dev(bdi, NULL); } - - cond_resched(); - if (again) - goto loop; - - spin_lock(&device->io_lock); - if (device->pending_bios.head || device->pending_sync_bios.head) - goto loop_lock; - 
spin_unlock(&device->io_lock); - /* * IO has already been through a long path to get here. Checksumming, * async helper threads, perhaps compression. We've done a pretty @@ -346,6 +336,16 @@ loop_lock: * cared about found its way down here. */ blk_run_backing_dev(bdi, NULL); + + cond_resched(); + if (again) + goto loop; + + spin_lock(&device->io_lock); + if (device->pending_bios.head || device->pending_sync_bios.head) + goto loop_lock; + spin_unlock(&device->io_lock); + done: return 0; } @@ -365,6 +365,7 @@ static noinline int device_list_add(const char *path, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; u64 found_transid = btrfs_super_generation(disk_super); + char *name; fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { @@ -411,6 +412,12 @@ static noinline int device_list_add(const char *path, device->fs_devices = fs_devices; fs_devices->num_devices++; + } else if (strcmp(device->name, path)) { + name = kstrdup(path, GFP_NOFS); + if (!name) + return -ENOMEM; + kfree(device->name); + device->name = name; } if (found_transid > fs_devices->latest_trans) { @@ -592,7 +599,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, goto error_close; disk_super = (struct btrfs_super_block *)bh->b_data; - devid = le64_to_cpu(disk_super->dev_item.devid); + devid = btrfs_stack_device_id(&disk_super->dev_item); if (devid != device->devid) goto error_brelse; @@ -694,7 +701,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; - devid = le64_to_cpu(disk_super->dev_item.devid); + devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); if (disk_super->label[0]) printk(KERN_INFO "device label %s ", disk_super->label); @@ -1187,7 +1194,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; - devid = le64_to_cpu(disk_super->dev_item.devid); + devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; device = btrfs_find_device(root, devid, dev_uuid, disk_super->fsid); diff --git a/fs/buffer.c b/fs/buffer.c index 6fa530256bf..c9c266db062 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, /* * The page straddles i_size. It must be zeroed out on each and every - * writepage invokation because it may be mmapped. "A file is mapped + * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
 */
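The quoted mmap semantics are why writepage must zero the tail of a page that straddles i_size on every invocation. A minimal sketch of that zeroing, loosely modelled on block_write_full_page(); the helper name is illustrative:

#include <linux/pagemap.h>
#include <linux/highmem.h>

static void demo_zero_eof_tail(struct page *page, struct inode *inode)
{
	loff_t size = i_size_read(inode);
	pgoff_t end_index = size >> PAGE_CACHE_SHIFT;
	unsigned offset = size & (PAGE_CACHE_SIZE - 1);

	/* page straddles i_size: zero from EOF to the end of the page */
	if (page->index == end_index && offset)
		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
}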
@@ -3265,7 +3265,7 @@ static void recalc_bh_state(void) struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) { - struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); + struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); get_cpu_var(bh_accounting).nr++; @@ -3352,15 +3352,6 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); -static void -init_buffer_head(void *data) -{ - struct buffer_head *bh = data; - - memset(bh, 0, sizeof(*bh)); - INIT_LIST_HEAD(&bh->b_assoc_buffers); -} - void __init buffer_init(void) { int nrpages; @@ -3369,7 +3360,7 @@ void __init buffer_init(void) sizeof(struct buffer_head), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_MEM_SPREAD), - init_buffer_head); + NULL); /* * Limit the bh occupancy to 10% of ZONE_NORMAL diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 20692fbfdb2..a20bea59893 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val) return 0; } - ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */ + ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */ if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ *val = *(++(ctx->pointer)); /* value has enum value */ else diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index b44ce0a0711..b1d61d0bdfc 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -54,7 +54,7 @@ void cifs_dfs_release_automount_timer(void) * Extracts sharename form full UNC. * i.e. strips from UNC trailing path that is not part of share * name and fixup missing '\' in the begining of DFS node refferal - * if neccessary. + * if necessary. * Returns pointer to share name on success or ERR_PTR on error. * Caller is responsible for freeing returned string. 
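A rough sketch of the share-name extraction the comment above describes: keep \\server\share and drop the trailing path. This is simplified (the real helper also fixes up a missing leading '\'), and the demo_* name is illustrative:

#include <linux/string.h>
#include <linux/err.h>
#include <linux/slab.h>

static char *demo_extract_sharename(const char *unc)
{
	const char *src = unc;
	const char *sep;

	while (*src == '\\')		/* skip leading backslashes */
		src++;
	sep = strchr(src, '\\');	/* end of the host component */
	if (!sep)
		return ERR_PTR(-EINVAL);
	sep = strchr(sep + 1, '\\');	/* start of the trailing path, if any */
	return kstrndup(unc, sep ? (size_t)(sep - unc) : strlen(unc),
			GFP_KERNEL);
}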
*/ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 20959befc70..7cc7f83e931 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4019,7 +4019,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, goto parse_DFS_referrals_exit; } - /* collect neccessary data from referrals */ + /* collect necessary data from referrals */ for (i = 0; i < *num_of_nodes; i++) { char *temp; int max_len; diff --git a/fs/compat.c b/fs/compat.c index 00d90c2e66f..030602d453b 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1795,6 +1795,24 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, return ret; } +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + compat_uptr_t tvp; +}; + +asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + #ifdef HAVE_SET_RESTORE_SIGMASK static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 0adced2f296..112e45a17e9 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -28,10 +28,12 @@ #undef elfhdr #undef elf_phdr +#undef elf_shdr #undef elf_note #undef elf_addr_t #define elfhdr elf32_hdr #define elf_phdr elf32_phdr +#define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 0ca9ec4a79c..6d55b61bfa7 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -545,7 +545,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) kcmd = MTIOCPOS; karg = &pos; break; - case MTIOCGET32: + default: /* MTIOCGET32 */ kcmd = MTIOCGET; karg = &get; break; @@ -663,7 +663,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd, switch (cmd) { case RAW_SETBIND: - case RAW_GETBIND: { + default: { /* RAW_GETBIND */ struct raw_config_request req; mm_segment_t oldfs = get_fs(); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 26a8bd40400..f994a7dfda8 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k) kfree(ls); } -static struct sysfs_ops dlm_attr_ops = { +static const struct sysfs_ops dlm_attr_ops = { .show = dlm_attr_show, .store = dlm_attr_store, }; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 84f70bfb0ba..b12532e553f 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -312,7 +312,7 @@ int dlm_ls_stop(struct dlm_ls *ls) /* * This in_recovery lock does two things: * 1) Keeps this function from returning until all threads are out - * of locking routines and locking is truely stopped. + * of locking routines and locking is truly stopped. * 2) Keeps any new requests from being processed until it's unlocked * when recovery is complete. */ diff --git a/fs/exec.c b/fs/exec.c index cce6bbdbdbb..49cdaa19e5b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -195,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. 
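The hunk below wraps the rlimit read in ACCESS_ONCE(): the stack rlimit can be changed concurrently by setrlimit(), and ACCESS_ONCE() forces a single, untorn load so the comparison uses one consistent snapshot. A minimal sketch of the idiom (the predicate name is illustrative):

#include <linux/sched.h>
#include <linux/compiler.h>

static int demo_arg_size_ok(unsigned long size)
{
	unsigned long cur =
		ACCESS_ONCE(current->signal->rlim[RLIMIT_STACK].rlim_cur);

	/* mirror the quarter-of-stack budget checked in get_arg_page() */
	return size <= cur / 4;
}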
*/ rlim = current->signal->rlim; - if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { put_page(page); return NULL; } @@ -246,6 +246,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) goto err; @@ -516,7 +517,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL); + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + return -ENOMEM; /* * move the page tables downwards, on failure we rely on @@ -547,15 +549,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) tlb_finish_mmu(tlb, new_end, old_end); /* - * shrink the vma to just the new range. + * Shrink the vma to just the new range. Always succeeds. */ vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); return 0; } -#define EXTRA_STACK_VM_PAGES 20 /* random */ - /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. @@ -577,7 +577,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size to 1GB */ - stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; + stack_base = rlimit_max(RLIMIT_STACK); if (stack_base > (1 << 30)) stack_base = 1 << 30; @@ -630,7 +630,7 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; } - stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE; + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack @@ -718,6 +718,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(tsk, old_mm); mm_release(tsk, old_mm); if (old_mm) { @@ -1532,7 +1533,7 @@ static int format_corename(char *corename, long signr) /* core limit size */ case 'c': rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); + "%lu", rlimit(RLIMIT_CORE)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1560,12 +1561,13 @@ out: return ispipe; } -static int zap_process(struct task_struct *start) +static int zap_process(struct task_struct *start, int exit_code) { struct task_struct *t; int nr = 0; start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; t = start; @@ -1590,8 +1592,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { mm->core_state = core_state; - tsk->signal->group_exit_code = exit_code; - nr = zap_process(tsk); + nr = zap_process(tsk, exit_code); } spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(nr < 0)) @@ -1640,7 +1641,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (p->mm) { if (unlikely(p->mm == mm)) { lock_task_sighand(p, &flags); - nr += zap_process(p); + nr += zap_process(p, exit_code); unlock_task_sighand(p, &flags); } break; @@ -1747,14 +1748,19 @@ void set_dumpable(struct mm_struct *mm, int value) } } -int get_dumpable(struct mm_struct *mm) +static int 
__get_dumpable(unsigned long mm_flags) { int ret; - ret = mm->flags & 0x3; + ret = mm_flags & MMF_DUMPABLE_MASK; return (ret >= 2) ? 2 : ret; } +int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + static void wait_for_dump_helpers(struct file *file) { struct pipe_inode_info *pipe; @@ -1797,7 +1803,13 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) struct coredump_params cprm = { .signr = signr, .regs = regs, - .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + */ + .mm_flags = mm->flags, }; audit_core_dumps(signr); @@ -1816,7 +1828,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) /* * If another thread got here first, or we are not dumpable, bail out. */ - if (mm->core_state || !get_dumpable(mm)) { + if (mm->core_state || !__get_dumpable(cprm.mm_flags)) { up_write(&mm->mmap_sem); put_cred(cred); goto fail; @@ -1827,7 +1839,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) * process nor do we know its entire history. We only know it * was tainted so we dump it as root in mode 2. */ - if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ + if (__get_dumpable(cprm.mm_flags) == 2) { + /* Setuid core dump mode */ flag = O_EXCL; /* Stop rewrite attacks */ cred->fsuid = 0; /* Dump root private */ } @@ -1923,8 +1936,9 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) /* * Dont allow local users get cute and trick others to coredump * into their pre-created files: + * Note, this is not relevant for pipes */ - if (inode->i_uid != current_fsuid()) + if (!ispipe && (inode->i_uid != current_fsuid())) goto close_fail; if (!cprm.file->f_op) goto close_fail; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 59b8bf2825c..8442e353309 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -261,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); struct inode *exofs_new_inode(struct inode *, int); -extern int exofs_write_inode(struct inode *, int); +extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_delete_inode(struct inode *); /* dir.c: */ diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 5514f3c2c2f..a17e4b733e3 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1280,9 +1280,9 @@ out: return ret; } -int exofs_write_inode(struct inode *inode, int wait) +int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return exofs_update_inode(inode, wait); + return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } /* diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 7f8d2e5a7ea..1d081f0cfec 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -570,7 +570,7 @@ do_more: error_return: brelse(bitmap_bh); release_blocks(sb, freed); - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); } /** @@ -1236,6 +1236,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, unsigned short windowsz = 0; unsigned long ngroups; unsigned long num = *count; + int ret; *errp = -ENOSPC; sb = inode->i_sb; @@ -1247,8 +1248,9 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, /* * Check quota for allocation of this block. 
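The ext2 hunks that follow show the calling-convention change this series applies everywhere: vfs_dq_alloc_block() returned non-zero on failure and callers hard-coded -EDQUOT, while dquot_alloc_block() returns a real errno that is propagated as-is. A minimal sketch of the new shape (the demo_* name is illustrative):

#include <linux/quotaops.h>

static int demo_charge_blocks(struct inode *inode, qsize_t nr)
{
	int ret = dquot_alloc_block(inode, nr); /* 0, -EDQUOT, -EIO, ... */

	if (ret)
		return ret;	/* propagate the real error code */
	/*
	 * ... perform the allocation; if it fails, undo the charge with
	 * dquot_free_block(inode, nr), as the error paths above do.
	 */
	return 0;
}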
*/ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + ret = dquot_alloc_block(inode, num); + if (ret) { + *errp = ret; return 0; } @@ -1409,7 +1411,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1420,7 +1422,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 061914add3c..0b038e47ad2 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ extern struct inode *ext2_iget (struct super_block *, unsigned long); -extern int ext2_write_inode (struct inode *, int); +extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_delete_inode (struct inode *); extern int ext2_sync_inode (struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 586e3589d4c..5d198d0697f 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -70,7 +71,7 @@ const struct file_operations ext2_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, .splice_read = generic_file_splice_read, @@ -87,7 +88,7 @@ const struct file_operations ext2_xip_file_operations = { .compat_ioctl = ext2_compat_ioctl, #endif .mmap = xip_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, }; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 15387c9c17d..ad7d572ee8d 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode) if (!is_bad_inode(inode)) { /* Quota is already initialized in iput() */ ext2_xattr_delete_inode(inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); } es = EXT2_SB(sb)->s_es; @@ -586,10 +586,10 @@ got: goto fail_drop; } - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext2_init_acl(inode, dir); if (err) @@ -605,10 +605,10 @@ got: return inode; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 71b032c65a0..fc13cc119aa 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); +static int __ext2_write_inode(struct inode *inode, int do_sync); + /* * Test whether an inode is a fast symlink. 
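A fast symlink stores its target inside the inode's block-pointer area rather than in a data block, so reading it needs no I/O. A minimal form of the test the comment above introduces, assuming no external xattr block inflates i_blocks (the real ext2 version also accounts for i_file_acl):

#include <linux/fs.h>

static inline int demo_is_fast_symlink(struct inode *inode)
{
	/* inline target: a symlink that owns no data blocks */
	return S_ISLNK(inode->i_mode) && inode->i_blocks == 0;
}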
*/ @@ -58,13 +60,15 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) */ void ext2_delete_inode (struct inode * inode) { + if (!is_bad_inode(inode)) + dquot_initialize(inode); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); - ext2_write_inode(inode, inode_needs_sync(inode)); + __ext2_write_inode(inode, inode_needs_sync(inode)); inode->i_size = 0; if (inode->i_blocks) @@ -1335,7 +1339,7 @@ bad_inode: return ERR_PTR(ret); } -int ext2_write_inode(struct inode *inode, int do_sync) +static int __ext2_write_inode(struct inode *inode, int do_sync) { struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; @@ -1440,6 +1444,11 @@ int ext2_write_inode(struct inode *inode, int do_sync) return err; } +int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int ext2_sync_inode(struct inode *inode) { struct writeback_control wbc = { @@ -1457,9 +1466,12 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) error = inode_change_ok(inode, iattr); if (error) return error; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0; + error = dquot_transfer(inode, iattr); if (error) return error; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index dd7175ce560..71efb0e9a3f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -31,6 +31,7 @@ */ #include <linux/pagemap.h> +#include <linux/quotaops.h> #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -99,24 +100,27 @@ struct dentry *ext2_get_parent(struct dentry *child) */ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { - struct inode * inode = ext2_new_inode (dir, mode); - int err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { - inode->i_mapping->a_ops = &ext2_nobh_aops; - inode->i_fop = &ext2_file_operations; - } else { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_file_operations; - } - mark_inode_dirty(inode); - err = ext2_add_nondir(dentry, inode); + struct inode *inode; + + dquot_initialize(dir); + + inode = ext2_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &ext2_file_inode_operations; + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { + inode->i_mapping->a_ops = &ext2_nobh_aops; + inode->i_fop = &ext2_file_operations; + } else { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; } - return err; + mark_inode_dirty(inode); + return ext2_add_nondir(dentry, inode); } static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) @@ -127,6 +131,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_ if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + inode = ext2_new_inode (dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -151,6 +157,8 @@ static int ext2_symlink (struct inode * dir, 
struct dentry * dentry, if (l > sb->s_blocksize) goto out; + dquot_initialize(dir); + inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -194,6 +202,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -216,6 +226,8 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT2_LINK_MAX) goto out; + dquot_initialize(dir); + inode_inc_link_count(dir); inode = ext2_new_inode (dir, S_IFDIR | mode); @@ -262,6 +274,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) struct page * page; int err = -ENOENT; + dquot_initialize(dir); + de = ext2_find_entry (dir, &dentry->d_name, &page); if (!de) goto out; @@ -304,6 +318,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err = -ENOENT; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f9cb54a585c..42e4a303b67 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -194,6 +194,8 @@ static void destroy_inodecache(void) static void ext2_clear_inode(struct inode *inode) { struct ext2_block_alloc_info *rsv = EXT2_I(inode)->i_block_alloc_info; + + dquot_drop(inode); ext2_discard_reservation(inode); EXT2_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 904f00642f8..e44dc92609b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -644,8 +644,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, the inode. */ ea_bdebug(new_bh, "reusing block"); - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) { + error = dquot_alloc_block(inode, 1); + if (error) { unlock_buffer(new_bh); goto cleanup; } @@ -702,7 +702,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, * as if nothing happened and cleanup the unused block */ if (error && error != -ENOSPC) { if (new_bh && new_bh != old_bh) - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; } } else @@ -734,7 +734,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, le32_add_cpu(&HDR(old_bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); mark_buffer_dirty(old_bh); ea_bdebug(old_bh, "refcount now=%d", le32_to_cpu(HDR(old_bh)->h_refcount)); @@ -797,7 +797,7 @@ ext2_xattr_delete_inode(struct inode *inode) mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); } EXT2_I(inode)->i_file_acl = 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 27967f92e82..161da2d3f89 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, } ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) - vfs_dq_free_block(inode, dquot_freed_blocks); + dquot_free_block(inode, dquot_freed_blocks); return; } @@ -1502,8 +1502,9 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, /* * Check quota for allocation of this block. 
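The ext2/namei.c hunks above add explicit dquot_initialize() calls at the top of create, mknod, link, unlink and rename: with the ->initialize hook gone from dquot_operations, each filesystem now attaches dquots itself before any operation that charges or transfers quota. A sketch of the resulting shape (the demo_* name is illustrative):

#include <linux/quotaops.h>

static int demo_dir_op(struct inode *dir, struct inode *victim)
{
	dquot_initialize(dir);		/* attach dquots before any charge */
	if (victim)
		dquot_initialize(victim); /* e.g. the inode being unlinked */
	/* ... modify the directory, then charge or release quota ... */
	return 0;
}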
*/ - if (vfs_dq_alloc_block(inode, num)) { - *errp = -EDQUOT; + err = dquot_alloc_block(inode, num); + if (err) { + *errp = err; return 0; } @@ -1713,7 +1714,7 @@ allocated: *errp = 0; brelse(bitmap_bh); - vfs_dq_free_block(inode, *count-num); + dquot_free_block(inode, *count-num); *count = num; return ret_block; @@ -1728,7 +1729,7 @@ out: * Undo the block allocation */ if (!performed_allocation) - vfs_dq_free_block(inode, *count); + dquot_free_block(inode, *count); brelse(bitmap_bh); return 0; } diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 388bbdfa0b4..f55df0e61cb 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -21,6 +21,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd.h> +#include <linux/quotaops.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> #include "xattr.h" @@ -33,9 +34,9 @@ */ static int ext3_release_file (struct inode * inode, struct file * filp) { - if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) { + if (ext3_test_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE)) { filemap_flush(inode->i_mapping); - EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE; + ext3_clear_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && @@ -62,7 +63,7 @@ const struct file_operations ext3_file_operations = { .compat_ioctl = ext3_compat_ioctl, #endif .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index b3999128513..ef9008b885b 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. 
*/ - vfs_dq_init(inode); + dquot_initialize(inode); ext3_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -588,10 +588,10 @@ got: sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext3_init_acl(handle, inode, dir); if (err) @@ -619,10 +619,10 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; unlock_new_inode(inode); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 455e6e6e5cb..7f920b7263a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -196,6 +196,9 @@ void ext3_delete_inode (struct inode * inode) { handle_t *handle; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -1378,7 +1381,7 @@ static int ext3_journalled_write_end(struct file *file, */ if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); @@ -1417,7 +1420,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; - if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { + if (ext3_test_inode_state(inode, EXT3_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -1436,7 +1439,7 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA; + ext3_clear_inode_state(inode, EXT3_STATE_JDATA); journal = EXT3_JOURNAL(inode); journal_lock_updates(journal); err = journal_flush(journal); @@ -1528,6 +1531,7 @@ static int ext3_ordered_writepage(struct page *page, int err; J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); /* * We give up here if we're reentered, because it might be for a @@ -1600,6 +1604,9 @@ static int ext3_writeback_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto out_fail; @@ -1642,6 +1649,9 @@ static int ext3_journalled_writepage(struct page *page, int ret = 0; int err; + J_ASSERT(PageLocked(page)); + WARN_ON_ONCE(IS_RDONLY(inode)); + if (ext3_journal_current_handle()) goto no_write; @@ -1670,7 +1680,7 @@ static int ext3_journalled_writepage(struct page *page, PAGE_CACHE_SIZE, NULL, write_end_fn); if (ret == 0) ret = err; - EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; + ext3_set_inode_state(inode, EXT3_STATE_JDATA); unlock_page(page); } else { /* @@ -1785,8 +1795,9 @@ retry: handle = ext3_journal_start(inode, 2); if (IS_ERR(handle)) { /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ + * but cannot extend i_size. Truncate allocated blocks + * and pretend the write failed... 
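The surrounding ext3 hunks replace open-coded flag arithmetic on EXT3_I(inode)->i_state, which is racy under concurrent updates, with ext3_set/clear/test_inode_state() helpers built on atomic bitops. A sketch of what such helpers look like, using an illustrative struct and field name rather than the real ones:

#include <linux/bitops.h>

struct demo_inode_info {
	unsigned long i_state_flags;	/* field name illustrative */
};

static inline void demo_set_state(struct demo_inode_info *ei, int bit)
{
	set_bit(bit, &ei->i_state_flags);	/* atomic, unlike |= */
}

static inline int demo_test_state(struct demo_inode_info *ei, int bit)
{
	return test_bit(bit, &ei->i_state_flags);
}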
*/ + ext3_truncate(inode); ret = PTR_ERR(handle); goto out; } @@ -2402,7 +2413,7 @@ void ext3_truncate(struct inode *inode) goto out_notrans; if (inode->i_size == 0 && ext3_should_writeback_data(inode)) - ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; + ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE); /* * We have to lock the EOF page here, because lock_page() nests @@ -2721,7 +2732,7 @@ int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext3_get_inode_loc(inode, iloc, - !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); + !ext3_test_inode_state(inode, EXT3_STATE_XATTR)); } void ext3_set_inode_flags(struct inode *inode) @@ -2893,7 +2904,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) - ei->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -2955,7 +2966,7 @@ again: /* For fields not not tracking in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT3_STATE_NEW) + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); ext3_get_inode_flags(ei); @@ -3052,7 +3063,7 @@ again: rc = ext3_journal_dirty_metadata(handle, bh); if (!err) err = rc; - ei->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid); out_brelse: @@ -3096,7 +3107,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext3_write_inode(struct inode *inode, int wait) +int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { if (current->flags & PF_MEMALLOC) return 0; @@ -3107,7 +3118,7 @@ int ext3_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; return ext3_force_commit(inode->i_sb); @@ -3140,6 +3151,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -3152,7 +3165,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext3_journal_stop(handle); return error; @@ -3237,7 +3250,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) ret = 2 * (bpp + indirects) + 2; #ifdef CONFIG_QUOTA - /* We know that structure was already allocated during vfs_dq_init so + /* We know that structure was already allocated during dquot_initialize so * we will be updating only the data blocks + inodes */ ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif @@ -3328,7 +3341,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_space() will always dirty the inode when blocks + * Also, dquot_alloc_space() will always dirty the inode when blocks * are allocated to the file. 
* are allocated to the file.
* * If the inode is marked synchronous, we don't honour that here - doing diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 7b0e44f7d66..ee184084ca4 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1696,6 +1696,8 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct inode * inode; int err, retries = 0; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1730,6 +1732,8 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1766,6 +1770,8 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -2060,7 +2066,9 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2119,7 +2127,9 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2174,6 +2184,8 @@ static int ext3_symlink (struct inode * dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + dquot_initialize(dir); + retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2228,6 +2240,9 @@ static int ext3_link (struct dentry * old_dentry, if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; + + dquot_initialize(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. @@ -2278,12 +2293,15 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, struct ext3_dir_entry_2 * old_de, * new_de; int retval, flush_file = 0; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index afa2b569da1..1bee604cc6c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -164,7 +164,7 @@ void ext3_msg(struct super_block *sb, const char *prefix, * write out the superblock safely. * * We'll just use the journal_abort() error code to record an error in - * the journal instead. On recovery, the journal will compain about + * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. 
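The super.c hunks below consistently use set_opt()/clear_opt()/test_opt() instead of open-coded masking of s_mount_opt. Such macros are conventionally defined roughly like this; the names and the flag value here are illustrative:

#define DEMO_MOUNT_ABORT	0x00200		/* flag value illustrative */

#define demo_set_opt(opts, opt)		((opts) |= DEMO_MOUNT_##opt)
#define demo_clear_opt(opts, opt)	((opts) &= ~DEMO_MOUNT_##opt)
#define demo_test_opt(sb, opt) \
	(EXT3_SB(sb)->s_mount_opt & DEMO_MOUNT_##opt)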
*/ @@ -181,7 +181,7 @@ static void ext3_handle_error(struct super_block *sb) if (!test_opt (sb, ERRORS_CONT)) { journal_t *journal = EXT3_SB(sb)->s_journal; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (journal) journal_abort(journal, -EIO); } @@ -296,7 +296,7 @@ void ext3_abort (struct super_block * sb, const char * function, "error: remounting filesystem read-only"); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; sb->s_flags |= MS_RDONLY; - EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + set_opt(EXT3_SB(sb)->s_mount_opt, ABORT); if (EXT3_SB(sb)->s_journal) journal_abort(EXT3_SB(sb)->s_journal, -EIO); } @@ -528,6 +528,8 @@ static void destroy_inodecache(void) static void ext3_clear_inode(struct inode *inode) { struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info; + + dquot_drop(inode); ext3_discard_reservation(inode); EXT3_I(inode)->i_block_alloc_info = NULL; if (unlikely(rsv)) @@ -562,10 +564,10 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } @@ -656,8 +658,7 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS)); + seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -751,13 +752,6 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext3_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ext3_write_dquot, .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, @@ -896,6 +890,63 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb) return sb_block; } +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext3_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext3_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext3_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) { + + struct ext3_sb_info *sbi = EXT3_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext3_msg(sb, KERN_ERR, "Cannot change journaled quota 
options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + static int parse_options (char *options, struct super_block *sb, unsigned int *inum, unsigned long *journal_devnum, ext3_fsblk_t *n_blocks_count, int is_remount) @@ -906,8 +957,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype, qfmt; - char *qname; + int qfmt; #endif if (!options) @@ -1065,20 +1115,19 @@ static int parse_options (char *options, struct super_block *sb, data_opt = EXT3_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - == data_opt) + if (test_opt(sb, DATA_FLAGS) == data_opt) break; ext3_msg(sb, KERN_ERR, "error: cannot change " "data mode on remount. The filesystem " "is mounted in data=%s mode and you " "try to remount it in data=%s mode.", - data_mode_string(sbi->s_mount_opt & - EXT3_MOUNT_DATA_FLAGS), + data_mode_string(test_opt(sb, + DATA_FLAGS)), data_mode_string(data_opt)); return 0; } else { - sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; @@ -1090,62 +1139,20 @@ static int parse_options (char *options, struct super_block *sb, break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, - "error: cannot change journaled " - "quota options when quota turned on."); - return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - ext3_msg(sb, KERN_ERR, - "error: not enough memory for " - "storing quotafile name."); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext3_msg(sb, KERN_ERR, - "error: %s quota file already " - "specified.", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext3_msg(sb, KERN_ERR, - "error: quotafile must be on " - "filesystem root."); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext3_msg(sb, KERN_ERR, "error: cannot change " - "journaled quota options when " - "quota turned on."); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: qfmt = QFMT_VFS_OLD; @@ -1244,18 +1251,12 @@ set_qf_format: } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - - if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] 
&& - (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) { + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext3_msg(sb, KERN_ERR, "error: old and new quota " "format mixing."); return 0; @@ -1478,7 +1479,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", @@ -1671,11 +1672,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -1694,7 +1695,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || @@ -2561,11 +2562,11 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) goto restore_opts; } - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + if (test_opt(sb, ABORT)) ext3_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; @@ -2573,7 +2574,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { + if (test_opt(sb, ABORT)) { err = -EROFS; goto restore_opts; } @@ -2734,7 +2735,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) * Process 1 Process 2 * ext3_create() quota_sync() * journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) journal_start() * */ @@ -2942,9 +2943,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, sector_t blk = off >> EXT3_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT3_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -2955,53 +2954,54 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } + + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. 
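Because only one block of journal credits is reserved, a quota write must fit in a single filesystem block. The check that follows is equivalent to this predicate; for example with a 4096-byte block, off=4090 leaves only 6 bytes, so len=10 is rejected (the demo_* name is illustrative):

#include <linux/fs.h>

static int demo_fits_in_one_block(struct super_block *sb,
				  loff_t off, size_t len)
{
	unsigned int offset = off & (sb->s_blocksize - 1);

	/* bytes left in the block starting at off must cover len */
	return len <= sb->s_blocksize - offset;
}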
+ */ + if (sb->s_blocksize - offset < len) { + ext3_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext3_bread(handle, inode, blk, 1, &err); - if (!bh) + bh = ext3_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext3_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext3_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext3_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext3_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext3_journal_dirty_metadata(handle, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext3_journal_dirty_data(handle, bh); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) { + if (err) { mutex_unlock(&inode->i_mutex); return err; } - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT3_I(inode)->i_disksize = inode->i_size; } inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 66895ccf76c..534a94c3a93 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -274,7 +274,7 @@ ext3_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return -ENODATA; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -403,7 +403,7 @@ ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) void *end; int error; - if (!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)) + if (!ext3_test_inode_state(inode, EXT3_STATE_XATTR)) return 0; error = ext3_get_inode_loc(inode, &iloc); if (error) @@ -500,7 +500,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode, error = ext3_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -775,8 +775,8 @@ inserted: else { /* The old block is released after updating the inode. 
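ext3 xattr blocks are shared between inodes via mbcache; reusing an existing block means bumping its on-disk refcount under the buffer lock and, with the conversion below, charging the inode one block of quota through dquot_alloc_block(). A sketch of the refcount bump, assuming xattr.c's BHDR() helper (which casts b_data to the block header):

#include <linux/buffer_head.h>

static void demo_share_xattr_block(struct buffer_head *bh)
{
	lock_buffer(bh);
	/* one more inode now references this block */
	le32_add_cpu(&BHDR(bh)->h_refcount, 1);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
}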
*/ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext3_journal_get_write_access(handle, new_bh); @@ -850,7 +850,7 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: @@ -882,7 +882,7 @@ ext3_xattr_ibody_find(struct inode *inode, struct ext3_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT3_SB(inode->i_sb)->s_inode_size; - if (EXT3_I(inode)->i_state & EXT3_STATE_XATTR) { + if (ext3_test_inode_state(inode, EXT3_STATE_XATTR)) { error = ext3_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -914,10 +914,10 @@ ext3_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext3_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); - EXT3_I(inode)->i_state |= EXT3_STATE_XATTR; + ext3_set_inode_state(inode, EXT3_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT3_I(inode)->i_state &= ~EXT3_STATE_XATTR; + ext3_clear_inode_state(inode, EXT3_STATE_XATTR); } return 0; } @@ -967,10 +967,10 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT3_I(inode)->i_state & EXT3_STATE_NEW) { + if (ext3_test_inode_state(inode, EXT3_STATE_NEW)) { struct ext3_inode *raw_inode = ext3_raw_inode(&is.iloc); memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); - EXT3_I(inode)->i_state &= ~EXT3_STATE_NEW; + ext3_clear_inode_state(inode, EXT3_STATE_NEW); } error = ext3_xattr_ibody_find(inode, &i, &is); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 22bc7435d91..d2f37a5516c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, - "Checksum bad for group %u", block_group); + ext4_error(sb, "Checksum bad for group %u", + block_group); ext4_free_blks_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); @@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * to make sure we calculate the right free blocks */ group_blocks = ext4_blocks_count(sbi->s_es) - - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); + ext4_group_first_block_no(sb, ngroups - 1); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -189,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * when a file system is mounted (see ext4_fill_super). 
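The ext4 hunks that follow drop the explicit function-name argument from ext4_error() calls; after this series a wrapper macro supplies __func__ itself, in the same spirit as the EXT4_ERROR_INODE macro added to ext4.h further below. Roughly, in simplified and illustrative form:

#define demo_ext4_error(sb, fmt, ...) \
	__ext4_error((sb), __func__, (fmt), ##__VA_ARGS__)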
*/ - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -210,10 +206,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); if (block_group >= ngroups) { - ext4_error(sb, "ext4_get_group_desc", - "block_group >= groups_count - " - "block_group = %u, groups_count = %u", - block_group, ngroups); + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); return NULL; } @@ -221,8 +215,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext4_error(sb, "ext4_get_group_desc", - "Group descriptor not loaded - " + ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; @@ -282,9 +275,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext4_error(sb, __func__, - "Invalid block bitmap - " - "block_group = %d, block = %llu", + ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", block_group, bitmap_blk); return 0; } @@ -311,8 +302,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __func__, - "Cannot read block bitmap - " + ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -354,8 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __func__, - "Cannot read block bitmap - " + ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -419,8 +408,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error(sb, __func__, - "Adding blocks in system zones - " + ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); goto error_return; @@ -453,8 +441,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { - ext4_error(sb, __func__, - "bit already cleared for block %llu", + ext4_error(sb, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index a60ab9aad57..983f0e12749 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb) entry = rb_entry(n, struct ext4_system_zone, node); kmem_cache_free(ext4_system_zone_cachep, entry); if (!parent) - EXT4_SB(sb)->system_blks.rb_node = NULL; + EXT4_SB(sb)->system_blks = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) parent->rb_right = NULL; n = parent; } - EXT4_SB(sb)->system_blks.rb_node = NULL; + 
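On the system_blks change ending above: assigning RB_ROOT replaces poking root.rb_node directly, so the reset stays correct even if struct rb_root ever grows members. A tiny userspace model of the idiom (the local rb_root/RB_ROOT definitions mimic include/linux/rbtree.h):

	#include <stdio.h>

	struct rb_node;				/* opaque in this sketch */
	struct rb_root { struct rb_node *rb_node; };
	#define RB_ROOT (struct rb_root) { NULL, }

	int main(void)
	{
		struct rb_root system_blks = RB_ROOT;

		/* ... free every node bottom-up, as the loop above does ... */
		system_blks = RB_ROOT;		/* reset the whole root */
		printf("empty: %d\n", system_blks.rb_node == NULL);
		return 0;
	}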
EXT4_SB(sb)->system_blks = RB_ROOT; } /* diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9dc93168e26..86cb6d86a04 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -83,10 +83,12 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error(dir->i_sb, function, - "bad entry in directory #%lu: %s - " - "offset=%u, inode=%u, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, offset, + __ext4_error(dir->i_sb, function, + "bad entry in directory #%lu: %s - block=%llu" + "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, + (unsigned long long) bh->b_blocknr, + (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; @@ -150,7 +152,7 @@ static int ext4_readdir(struct file *filp, */ if (!bh) { if (!dir_has_error) { - ext4_error(sb, __func__, "directory #%lu " + ext4_error(sb, "directory #%lu " "contains a hole at offset %Lu", inode->i_ino, (unsigned long long) filp->f_pos); @@ -303,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root) kfree(old); } if (!parent) - root->rb_node = NULL; + *root = RB_ROOT; else if (parent->rb_left == n) parent->rb_left = NULL; else if (parent->rb_right == n) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4cedc91ec59..bf938cf7c5f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -53,6 +53,12 @@ #define ext4_debug(f, a...) do {} while (0) #endif +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode(__func__, (inode), (fmt), ## a); + +#define EXT4_ERROR_FILE(file, fmt, a...) \ + ext4_error_file(__func__, (file), (fmt), ## a); + /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -133,14 +139,14 @@ struct mpage_da_data { int pages_written; int retval; }; -#define DIO_AIO_UNWRITTEN 0x1 +#define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { struct list_head list; /* per-file finished AIO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ - int error; /* I/O error code */ - ext4_lblk_t offset; /* offset in the file */ - size_t size; /* size of the extent */ + struct page *page; /* page struct for buffer write */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ } ext4_io_end_t; @@ -284,10 +290,12 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. 
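Sanity check on the widened flag masks above: the new EXT4_FL_USER_VISIBLE/EXT4_FL_USER_MODIFIABLE values are exactly the old masks with the new EXT4_EOFBLOCKS_FL bit (0x00400000) folded in, while EXT4_EA_INODE_FL stays hidden from userspace. Quick verification:

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned old_visible = 0x000BDFFF, old_modifiable = 0x000B80FF;
		unsigned eofblocks_fl = 0x00400000, ea_inode_fl = 0x00200000;

		assert((old_visible | eofblocks_fl) == 0x004BDFFF);
		assert((old_modifiable | eofblocks_fl) == 0x004B80FF);
		assert((0x004BDFFF & ea_inode_fl) == 0); /* EA-inode flag hidden */
		puts("flag masks consistent");
		return 0;
	}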
*/ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ @@ -313,17 +321,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) return flags & EXT4_OTHER_FLMASK; } -/* - * Inode dynamic state flags - */ -#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */ -#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ -#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ -#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ -#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ -#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ -#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ - /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { __u32 group; /* Group number for this data */ @@ -364,19 +361,20 @@ struct ext4_new_group_data { /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_DIO 0x0008 +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 -#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Convert extent to initialized after direct IO complete */ -#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_DIO_CREATE_EXT) /* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 /* * ioctl commands @@ -630,7 +628,7 @@ struct ext4_inode_info { * near to their parent directory's inode. 
*/ ext4_group_t i_block_group; - __u32 i_state; /* Dynamic state flags for ext4 */ + unsigned long i_state_flags; /* Dynamic state flags */ ext4_lblk_t i_dir_start_lookup; #ifdef CONFIG_EXT4_FS_XATTR @@ -708,8 +706,9 @@ struct ext4_inode_info { qsize_t i_reserved_quota; #endif - /* completed async DIOs that might need unwritten extents handling */ - struct list_head i_aio_dio_complete_list; + /* completed IOs that might need unwritten extents handling */ + struct list_head i_completed_io_list; + spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; @@ -760,6 +759,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ @@ -1050,6 +1050,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ +}; + +static inline int ext4_test_inode_state(struct inode *inode, int bit) +{ + return test_bit(bit, &EXT4_I(inode)->i_state_flags); +} + +static inline void ext4_set_inode_state(struct inode *inode, int bit) +{ + set_bit(bit, &EXT4_I(inode)->i_state_flags); +} + +static inline void ext4_clear_inode_state(struct inode *inode, int bit) +{ + clear_bit(bit, &EXT4_I(inode)->i_state_flags); +} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. 
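The i_state to i_state_flags conversion running through these hunks swaps plain |= and &= on a __u32 for test_bit()/set_bit()/clear_bit() on an unsigned long, the usual motivation being that concurrent updaters cannot lose each other's bits. A userspace model, with C11 atomics standing in for the kernel bitops:

	#include <stdatomic.h>
	#include <stdio.h>

	enum { STATE_JDATA, STATE_NEW, STATE_XATTR };	/* mirrors the new enum */

	static atomic_ulong state_flags;	/* stands in for i_state_flags */

	static void set_state(int bit)
	{
		atomic_fetch_or(&state_flags, 1UL << bit);
	}

	static void clear_state(int bit)
	{
		atomic_fetch_and(&state_flags, ~(1UL << bit));
	}

	static int test_state(int bit)
	{
		return (atomic_load(&state_flags) >> bit) & 1;
	}

	int main(void)
	{
		set_state(STATE_NEW);
		printf("new=%d xattr=%d\n", test_state(STATE_NEW),
		       test_state(STATE_XATTR));
		clear_state(STATE_NEW);
		printf("new=%d\n", test_state(STATE_NEW));
		return 0;
	}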
This will allow us to call the feature-test @@ -1126,6 +1154,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1416,7 +1446,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern int ext4_write_inode(struct inode *, int); +extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); @@ -1439,7 +1469,7 @@ extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); -extern int flush_aio_dio_completed_IO(struct inode *inode); +extern int flush_completed_IO(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); /* ioctl.c */ @@ -1465,13 +1495,20 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void ext4_error(struct super_block *, const char *, const char *, ...) +extern void __ext4_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) +extern void ext4_error_inode(const char *, struct inode *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_error_file(const char *, struct file *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void __ext4_std_error(struct super_block *, const char *, int); extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_warning(struct super_block *, const char *, const char *, ...) +extern void __ext4_warning(struct super_block *, const char *, + const char *, ...) __attribute__ ((format (printf, 3, 4))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) 
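The __ext4_error()/ext4_error() split above lets every call site drop its hand-written function name: the macro injects __func__ itself, and ext4_warning() gets the same treatment. A compilable userspace model of the wrapper (my_error()/__my_error() are illustrative names):

	#include <stdarg.h>
	#include <stdio.h>

	static void __my_error(const char *func, const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		fprintf(stderr, "error (%s): ", func);
		vfprintf(stderr, fmt, args);
		fputc('\n', stderr);
		va_end(args);
	}

	/* same shape as the new ext4_error()/ext4_warning() macros */
	#define my_error(message...) __my_error(__func__, ## message)

	int main(void)
	{
		my_error("Checksum bad for group %u", 42U);
		return 0;
	}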
__attribute__ ((format (printf, 3, 4))); extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, @@ -1744,7 +1781,7 @@ extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - loff_t len); + ssize_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, int flags); @@ -1756,6 +1793,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 len, __u64 *moved_len); +/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ +enum ext4_state_bits { + BH_Uninit /* blocks are allocated but uninitialized on disk */ + = BH_JBDPrivateStart, +}; + +BUFFER_FNS(Uninit, uninit) +TAS_BUFFER_FNS(Uninit, uninit) + /* * Add new method to test wether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough @@ -1773,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b57e5c711b6..53d2764d71c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -125,14 +125,14 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } else { - if (inode && bh) + if (inode) mark_buffer_dirty_inode(bh, inode); else mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "IO error syncing inode, " "inode=%lu, block=%llu", inode->i_ino, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 05eca817d70..b79ad512646 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode) return 0; } +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work for nobh or if data journaling is + * enabled, since the dioread_nolock code uses b_private to pass + * information back to the I/O completion handler, and this conflicts + * with the jbd's use of b_private. 
+ */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (test_opt(inode->i_sb, NOBH)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 765a4826b11..94c8ee81f5e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, if (S_ISREG(inode->i_mode)) block_group++; } - bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); + bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* @@ -440,7 +439,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - ext4_error(inode->i_sb, function, + __ext4_error(inode->i_sb, function, "bad header/extent in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), @@ -703,7 +702,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, } eh = ext_block_hdr(bh); ppos++; - BUG_ON(ppos > depth); + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + goto err; + } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; i--; @@ -749,7 +753,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (err) return err; - BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ @@ -779,9 +788,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); - BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max)); - BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); @@ -819,7 +836,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* if current leaf will be split, then we should use * border from split point */ - BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug("leaf will be split." 
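A recurring pattern in the extents.c hunks above and below: BUG_ON() asserts about on-disk state become EXT4_ERROR_INODE() reports followed by -EIO, so a corrupted extent tree fails the operation instead of crashing the machine. Schematic userspace version (report_corruption() is a stand-in for EXT4_ERROR_INODE()):

	#include <errno.h>
	#include <stdarg.h>
	#include <stdio.h>

	#define unlikely(x) __builtin_expect(!!(x), 0)

	static void report_corruption(const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		fprintf(stderr, "corruption: ");
		vfprintf(stderr, fmt, args);
		fputc('\n', stderr);
		va_end(args);
	}

	/* was: BUG_ON(logical == ei_block) */
	static int insert_index(int logical, int ei_block)
	{
		if (unlikely(logical == ei_block)) {
			report_corruption("logical %d == ei_block %d!",
					  logical, ei_block);
			return -EIO;	/* fail the operation, don't crash */
		}
		return 0;
	}

	int main(void)
	{
		printf("%d\n", insert_index(7, 7));	/* -EIO (-5 on Linux) */
		return 0;
	}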
@@ -860,7 +880,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* initialize new leaf */ newblock = ablocks[--a]; - BUG_ON(newblock == 0); + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EIO; + goto cleanup; + } bh = sb_getblk(inode->i_sb, newblock); if (!bh) { err = -EIO; @@ -880,7 +904,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ - BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; + } /* start copy from next extent */ /* TODO: we could do it by single memmove */ m = 0; @@ -927,7 +958,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* create intermediate indexes */ k = depth - at - 1; - BUG_ON(k < 0); + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } if (k) ext_debug("create %d intermediate indices\n", k); /* insert new index into current index block */ @@ -964,8 +999,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); - BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != - EXT_LAST_INDEX(path[i].p_hdr)); + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ext_debug("%d: move %d:%llu in new index %llu\n", i, le32_to_cpu(path[i].p_idx->ei_block), @@ -1203,7 +1244,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex; int depth, ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1217,15 +1261,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? ix->ei_block : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
+ EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, + depth); + return -EIO; + } } return 0; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; *phys = ext_pblock(ex) + ee_len - 1; @@ -1251,7 +1313,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, int depth; /* Note, NOT eh_depth; depth from top of tree */ int ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1265,17 +1330,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } } *logical = le32_to_cpu(ex->ee_block); *phys = ext_pblock(ex); return 0; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ @@ -1414,8 +1494,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, eh = path[depth].p_hdr; ex = path[depth].p_ext; - BUG_ON(ex == NULL); - BUG_ON(eh == NULL); + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } if (depth == 0) { /* there is no tree at all */ @@ -1538,8 +1622,9 @@ int ext4_ext_try_to_merge(struct inode *inode, merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) - ext4_error(inode->i_sb, "ext4_ext_try_to_merge", - "inode#%lu, eh->eh_entries = 0!", inode->i_ino); + ext4_error(inode->i_sb, + "inode#%lu, eh->eh_entries = 0!", + inode->i_ino); } return merge_done; @@ -1612,13 +1697,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; unsigned uninitialized = 0; - BUG_ON(ext4_ext_get_actual_len(newext) == 0); + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } depth = ext_depth(inode); ex = path[depth].p_ext; - BUG_ON(path[depth].p_hdr == NULL); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /* try to insert block into found extent and return */ - if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) && ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", ext4_ext_is_uninitialized(newext), @@ -1739,7 +1830,7 @@ has_space: merge: /* try to merge extents to the right */ - if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) 
+ if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(inode, path, nearex); /* try to merge extents to the left */ @@ -1787,7 +1878,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, } depth = ext_depth(inode); - BUG_ON(path[depth].p_hdr == NULL); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); @@ -1838,7 +1933,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, cbex.ec_type = EXT4_EXT_CACHE_EXTENT; } - BUG_ON(cbex.ec_len == 0); + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + err = -EIO; + break; + } err = func(inode, path, &cbex, ex, cbdata); ext4_ext_drop_refs(path); @@ -1952,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && cex->ec_type != EXT4_EXT_CACHE_EXTENT); - if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { + if (in_range(block, cex->ec_block, cex->ec_len)) { ex->ee_block = cpu_to_le32(cex->ec_block); ext4_ext_store_pblock(ex, cex->ec_start); ex->ee_len = cpu_to_le16(cex->ec_len); @@ -1981,7 +2080,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, /* free index block */ path--; leaf = idx_pblock(path->p_idx); - BUG_ON(path->p_hdr->eh_entries == 0); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } err = ext4_ext_get_access(handle, inode, path); if (err) return err; @@ -2119,8 +2221,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; - BUG_ON(eh == NULL); - + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /* find where to start removing */ ex = EXT_LAST_EXTENT(eh); @@ -2983,7 +3087,7 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + depth); return err; } -static int ext4_convert_unwritten_extents_dio(handle_t *handle, +static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { @@ -3063,8 +3167,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - /* DIO get_block() before submit the IO, split the extent */ - if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + /* get_block() before submit the IO, split the extent */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); @@ -3074,14 +3178,16 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * completed */ if (io) - io->flag = DIO_AIO_UNWRITTEN; + io->flag = EXT4_IO_UNWRITTEN; else - EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + if (ext4_should_dioread_nolock(inode)) + set_buffer_uninit(bh_result); goto out; } - /* async DIO end_io complete, convert the filled extent to written */ - if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { - ret = ext4_convert_unwritten_extents_dio(handle, inode, + /* IO end_io complete, convert the filled extent to written */ + if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, path); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3185,7 +3291,7 
@@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *last_ex; ext4_fsblk_t newblock; int err = 0, depth, ret, cache_type; unsigned int allocated = 0; @@ -3237,10 +3343,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_ext_find_extent() */ - if (path[depth].p_ext == NULL && depth != 0) { - ext4_error(inode->i_sb, __func__, "bad extent address " - "inode: %lu, iblock: %d, depth: %d", - inode->i_ino, iblock, depth); + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "iblock: %d, depth: %d pblock %lld", + iblock, depth, path[depth].p_block); err = -EIO; goto out2; } @@ -3258,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, */ ee_len = ext4_ext_get_actual_len(ex); /* if found extent covers block, simply return it */ - if (iblock >= ee_block && iblock < ee_block + ee_len) { + if (in_range(iblock, ee_block, ee_len)) { newblock = iblock - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (iblock - ee_block); @@ -3350,21 +3456,35 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ ext4_ext_mark_uninitialized(&newex); /* - * io_end structure was created for every async - * direct IO write to the middle of the file. - * To avoid unecessary convertion for every aio dio rewrite - * to the mid of file, here we flag the IO that is really - * need the convertion. + * io_end structure was created for every IO write to an + * uninitialized extent. To avoid unecessary conversion, + * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform convertion when IO is done. */ - if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { if (io) - io->flag = DIO_AIO_UNWRITTEN; + io->flag = EXT4_IO_UNWRITTEN; else - EXT4_I(inode)->i_state |= - EXT4_STATE_DIO_UNWRITTEN;; + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (ext4_should_dioread_nolock(inode)) + set_buffer_uninit(bh_result); + } + + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { + if (unlikely(!eh->eh_entries)) { + EXT4_ERROR_INODE(inode, + "eh->eh_entries == 0 ee_block %d", + ex->ee_block); + err = -EIO; + goto out2; } + last_ex = EXT_LAST_EXTENT(eh); + if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { @@ -3499,6 +3619,13 @@ static void ext4_falloc_update_inode(struct inode *inode, i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if (new_size > i_size_read(inode)) + EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; } } @@ -3603,7 +3730,7 @@ retry: * Returns 0 on success. 
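Both the extent-cache lookup and the found-extent test above now go through in_range(), the macro this series relocates from balloc.c into ext4.h; the closed-interval form reads the same as the open-coded comparison it replaces. Exercising the macro as defined in the hunk:

	#include <assert.h>
	#include <stdio.h>

	#define in_range(b, first, len) \
		((b) >= (first) && (b) <= (first) + (len) - 1)

	int main(void)
	{
		unsigned ee_block = 100, ee_len = 8;

		assert(in_range(100u, ee_block, ee_len));	/* first block */
		assert(in_range(107u, ee_block, ee_len));	/* last block */
		assert(!in_range(108u, ee_block, ee_len));	/* one past the end */
		puts("in_range ok");
		return 0;
	}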
*/ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - loff_t len) + ssize_t len) { handle_t *handle; ext4_lblk_t block; @@ -3635,7 +3762,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, map_bh.b_state = 0; ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh, - EXT4_GET_BLOCKS_DIO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) { WARN_ON(ret <= 0); printk(KERN_ERR "%s: ext4_ext_get_blocks " @@ -3739,7 +3866,7 @@ static int ext4_xattr_fiemap(struct inode *inode, int error = 0; /* in-inode? */ - if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ @@ -3767,7 +3894,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk; - ext4_lblk_t len_blks; int error = 0; /* fallback to generic here if not in extents fmt */ @@ -3781,8 +3907,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { error = ext4_xattr_fiemap(inode, fieinfo); } else { + ext4_lblk_t len_blks; + __u64 last_blk; + start_blk = start >> inode->i_sb->s_blocksize_bits; - len_blks = len >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCK) + last_blk = EXT_MAX_BLOCK-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information. diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 56eee3d796c..d0776e410f3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/quotaops.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -35,9 +36,9 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { - if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); - EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && @@ -125,7 +126,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) sb->s_dirt = 1; } } - return generic_file_open(inode, filp); + return dquot_file_open(inode, filp); } const struct file_operations ext4_file_operations = { diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 98bd140aad0..0d0c3239c1c 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; - ret = flush_aio_dio_completed_IO(inode); + ret = flush_completed_IO(inode); if (ret < 0) return ret; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f3624ead4f6..361c0b9962a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. 
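On the ext4_fiemap() change above: computing len_blks as a plain len >> blocksize_bits could undercount the blocks a byte range touches, so the hunk derives it from a last-block value clamped to EXT_MAX_BLOCK - 1. Userspace model of that arithmetic (EXT_MAX_BLOCK here is the 32-bit logical-block limit the code is assumed to use):

	#include <stdio.h>

	#define EXT_MAX_BLOCK 0xffffffffu	/* assumed logical-block limit */

	static unsigned long long fiemap_len_blks(unsigned long long start,
						  unsigned long long len,
						  unsigned blkbits)
	{
		unsigned long long start_blk = start >> blkbits;
		unsigned long long last_blk = (start + len - 1) >> blkbits;

		if (last_blk >= EXT_MAX_BLOCK)
			last_blk = EXT_MAX_BLOCK - 1;
		return last_blk - start_blk + 1;
	}

	int main(void)
	{
		/* 4k blocks: 8192 bytes at offset 0 is exactly two blocks,
		 * but 4097 bytes at offset 4095 also spans two blocks even
		 * though 4097 >> 12 alone would give 1 */
		printf("%llu\n", fiemap_len_blks(0, 8192, 12));
		printf("%llu\n", fiemap_len_blks(4095, 4097, 12));
		return 0;
	}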
*/ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %u", - block_group); + ext4_error(sb, "Checksum bad for group %u", block_group); ext4_free_blks_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); @@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __func__, - "Cannot read inode bitmap - " + ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __func__, - "Cannot read inode bitmap - " + ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; @@ -217,10 +214,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_init(inode); + dquot_initialize(inode); ext4_xattr_delete_inode(handle, inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) es = EXT4_SB(sb)->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext4_error(sb, "ext4_free_inode", - "reserved or nonexistent inode %lu", ino); + ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); @@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit, bitmap_bh->b_data); if (!cleared) - ext4_error(sb, "ext4_free_inode", - "bit already cleared for inode %lu", ino); + ext4_error(sb, "bit already cleared for inode %lu", ino); else { gdp = ext4_get_group_desc(sb, block_group, &bh2); @@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb, if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { ext4_unlock_group(sb, group); - ext4_error(sb, __func__, - "reserved inode or inode > inodes count - " + ext4_error(sb, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); return 1; @@ -904,7 +898,7 @@ repeat_in_this_group: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, - inode, + NULL, inode_bitmap_bh); if (err) goto fail; @@ -1029,15 +1023,16 @@ got: inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state = EXT4_STATE_NEW; + ei->i_state_flags = 0; + ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext4_init_acl(handle, inode, dir); if (err) @@ -1074,10 +1069,10 @@ really_out: return ret; fail_free_drop: - vfs_dq_free_inode(inode); + dquot_free_inode(inode); fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; 
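The ialloc.c hunks above are part of the tree-wide quota-API migration: the vfs_dq_*() helpers returned a boolean and made callers substitute -EDQUOT, while the dquot_*() replacements hand back an errno directly. Schematic before/after (the stub stands in for dquot_alloc_inode()):

	#include <errno.h>
	#include <stdio.h>

	/* new-style helper: 0 on success, -errno on failure */
	static int dquot_alloc_inode_stub(int over_quota)
	{
		return over_quota ? -EDQUOT : 0;
	}

	int main(void)
	{
		int err;

		/* old style (removed):
		 *	if (vfs_dq_alloc_inode(inode)) {
		 *		err = -EDQUOT;
		 *		goto fail_drop;
		 *	}
		 * new style: */
		err = dquot_alloc_inode_stub(1);
		if (err)
			printf("alloc failed: %d\n", err);	/* -EDQUOT */
		return 0;
	}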
unlock_new_inode(inode); @@ -1098,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext4_warning(sb, __func__, - "bad orphan ino %lu! e2fsck was run?", ino); + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } @@ -1107,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext4_warning(sb, __func__, - "inode bitmap error for orphan %lu", ino); + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); goto error; } @@ -1140,8 +1133,7 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext4_warning(sb, __func__, - "bad orphan inode %lu! e2fsck was run?", ino); + ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e11952404e0..986120f3006 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@ #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> +#include <linux/kernel.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -170,6 +171,9 @@ void ext4_delete_inode(struct inode *inode) handle_t *handle; int err; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); @@ -194,7 +198,7 @@ void ext4_delete_inode(struct inode *inode) inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } @@ -212,7 +216,7 @@ void ext4_delete_inode(struct inode *inode) if (err > 0) err = ext4_journal_restart(handle, 3); if (err != 0) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "couldn't extend journal (err %d)", err); stop_handle: ext4_journal_stop(handle); @@ -323,8 +327,7 @@ static int ext4_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max in inode %lu", + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } @@ -344,7 +347,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - ext4_error(inode->i_sb, function, + __ext4_error(inode->i_sb, function, "invalid block reference %u " "in inode #%lu", blk, inode->i_ino); return -EIO; @@ -607,7 +610,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (*err) goto failed_out; - BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } target -= count; /* allocate blocks for indirect blocks */ @@ -643,7 +653,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; current_block = ext4_mb_new_blocks(handle, &ar, err); - BUG_ON(current_block + ar.len > 
EXT4_MAX_BLOCK_FILE_PHYS); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } if (*err && (target == blks)) { /* @@ -1061,6 +1078,7 @@ void ext4_da_update_reserve_space(struct inode *inode, int mdb_free = 0, allocated_meta_blocks = 0; spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " "with only %d reserved data blocks\n", @@ -1093,9 +1111,9 @@ void ext4_da_update_reserve_space(struct inode *inode, /* Update quota subsystem */ if (quota_claim) { - vfs_dq_claim_block(inode, used); + dquot_claim_block(inode, used); if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + dquot_release_reservation_block(inode, mdb_free); } else { /* * We did fallocate with an offset that is already delayed @@ -1106,8 +1124,8 @@ void ext4_da_update_reserve_space(struct inode *inode, * that */ if (allocated_meta_blocks) - vfs_dq_claim_block(inode, allocated_meta_blocks); - vfs_dq_release_reservation_block(inode, mdb_free + used); + dquot_claim_block(inode, allocated_meta_blocks); + dquot_release_reservation_block(inode, mdb_free + used); } /* @@ -1124,7 +1142,7 @@ static int check_block_validity(struct inode *inode, const char *msg, sector_t logical, sector_t phys, int len) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - ext4_error(inode->i_sb, msg, + __ext4_error(inode->i_sb, msg, "inode #%lu logical block %llu mapped to %llu " "(size %d)", inode->i_ino, (unsigned long long) logical, @@ -1306,7 +1324,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * i_data's format changing. 
Force the migrate * to fail by clearing migrate flags */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } /* @@ -1534,6 +1552,8 @@ static void ext4_truncate_failed_write(struct inode *inode) ext4_truncate(inode); } +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1575,8 +1595,12 @@ retry: } *pagep = page; - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + if (ext4_should_dioread_nolock(inode)) + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block_write); + else + ret = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -1793,7 +1817,7 @@ static int ext4_journalled_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); @@ -1836,6 +1860,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned long md_needed, md_reserved; + int ret; /* * recalculate the amount of metadata blocks to reserve @@ -1846,6 +1871,7 @@ repeat: spin_lock(&ei->i_block_reservation_lock); md_reserved = ei->i_reserved_meta_blocks; md_needed = ext4_calc_metadata_amount(inode, lblock); + trace_ext4_da_reserve_space(inode, md_needed); spin_unlock(&ei->i_block_reservation_lock); /* @@ -1853,11 +1879,12 @@ repeat: * later. Real quota accounting is done at pages writeout * time. 
*/ - if (vfs_dq_reserve_block(inode, md_needed + 1)) - return -EDQUOT; + ret = dquot_reserve_block(inode, md_needed + 1); + if (ret) + return ret; if (ext4_claim_free_blocks(sbi, md_needed + 1)) { - vfs_dq_release_reservation_block(inode, md_needed + 1); + dquot_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; @@ -1914,7 +1941,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, to_free); + dquot_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -2091,6 +2118,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); + if (buffer_uninit(exbh)) + set_buffer_uninit(bh); cur_logical++; pblock++; } while ((bh = bh->b_this_page) != head); @@ -2133,17 +2162,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) + if (page->index > end) break; - index++; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); block_invalidatepage(page, 0); ClearPageUptodate(page); unlock_page(page); } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); } return; } @@ -2220,6 +2248,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) */ new.b_state = 0; get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (ext4_should_dioread_nolock(mpd->inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (mpd->b_state & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2630,11 +2660,14 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: return ret; } +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). 
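The ext4_da_block_invalidatepages() fix above repairs two problems in the batched page scan: the resume index is now taken from the last page the lookup actually returned, and the pagevec is released every round instead of leaking page references. Generic model of that scan pattern (lookup() plays pagevec_lookup()):

	#include <stdio.h>

	#define BATCH 4

	/* fill up to BATCH "pages" in [index, end]; return how many */
	static int lookup(unsigned index, unsigned end, unsigned *items)
	{
		int n = 0;

		while (n < BATCH && index <= end)
			items[n++] = index++;
		return n;
	}

	int main(void)
	{
		unsigned index = 0, end = 9, items[BATCH];
		int n;

		while ((n = lookup(index, end, items)) > 0) {
			for (int i = 0; i < n; i++)
				printf("invalidate page %u\n", items[i]);
			index = items[n - 1] + 1;	/* resume after last hit */
			/* pagevec_release() would drop the refs here */
		}
		return 0;
	}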
We even don't @@ -2682,7 +2715,7 @@ static int ext4_writepage(struct page *page, int ret = 0; loff_t size; unsigned int len; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; trace_ext4_writepage(inode, page); @@ -2758,7 +2791,11 @@ static int ext4_writepage(struct page *page, if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); - else + else if (page_bufs && buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + ret = block_write_full_page_endio(page, noalloc_get_block_write, + wbc, ext4_end_io_buffer_write); + } else ret = block_write_full_page(page, noalloc_get_block_write, wbc); @@ -3301,7 +3338,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -3320,7 +3358,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) * everything they get. */ - EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); journal = EXT4_JOURNAL(inode); jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal); @@ -3345,11 +3383,45 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } +static void ext4_free_io_end(ext4_io_end_t *io) +{ + BUG_ON(!io); + if (io->page) + put_page(io->page); + iput(io->inode); + kfree(io); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + if (!page_has_buffers(page)) + return; + head = bh = page_buffers(page); + do { + if (offset <= curr_off && test_clear_buffer_uninit(bh) + && bh->b_private) { + ext4_free_io_end(bh->b_private); + bh->b_private = NULL; + bh->b_end_io = NULL; + } + curr_off = curr_off + bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +} + static void ext4_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); /* + * free any io_end structure allocated for buffers to be discarded + */ + if (ext4_should_dioread_nolock(page->mapping->host)) + ext4_invalidatepage_free_endio(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0) @@ -3420,7 +3492,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + if (rw == READ && ext4_should_dioread_nolock(inode)) + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); + else + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block, NULL); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -3436,6 +3515,9 @@ retry: * but cannot extend i_size. Bail out and pretend * the write failed... 
*/ ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + goto out; } if (inode->i_nlink) @@ -3463,75 +3545,63 @@ out: return ret; } -static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - handle_t *handle = NULL; + handle_t *handle = ext4_journal_current_handle(); int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; int dio_credits; + int started = 0; - ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", + ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", inode->i_ino, create); /* - * DIO VFS code passes create = 0 flag for write to - * the middle of file. It does this to avoid block - * allocation for holes, to prevent expose stale data - * out when there is parallel buffered read (which does - * not hold the i_mutex lock) while direct IO write has - * not completed. DIO request on holes finally falls back - * to buffered IO for this reason. - * - * For ext4 extent based file, since we support fallocate, - * new allocated extent as uninitialized, for holes, we - * could fallocate blocks for holes, thus parallel - * buffered IO read will zero out the page when read on - * a hole while parallel DIO write to the hole has not completed. - * - * when we come here, we know it's a direct IO write to - * to the middle of file (<i_size) - * so it's safe to override the create flag from VFS. + * ext4_get_block in prepare for a DIO write or buffer write. + * We allocate an uinitialized extent if blocks haven't been allocated. + * The extent will be converted to initialized after IO complete. */ - create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; + create = EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + if (!handle) { + if (max_blocks > DIO_MAX_BLOCKS) + max_blocks = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + started = 1; } + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; } - ext4_journal_stop(handle); + if (started) + ext4_journal_stop(handle); out: return ret; } -static void ext4_free_io_end(ext4_io_end_t *io) -{ - BUG_ON(!io); - iput(io->inode); - kfree(io); -} -static void dump_aio_dio_list(struct inode * inode) +static void dump_completed_IO(struct inode * inode) { #ifdef EXT4_DEBUG struct list_head *cur, *before, *after; ext4_io_end_t *io, *io0, *io1; + unsigned long flags; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); return; } - ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); - list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ cur = &io->list; before = cur->prev; io0 = 
container_of(before, ext4_io_end_t, list); @@ -3541,32 +3611,31 @@ static void dump_aio_dio_list(struct inode * inode) ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", io, inode->i_ino, io0, io1); } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); #endif } /* * check a range of space and convert unwritten extents to written. */ -static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) +static int ext4_end_io_nolock(ext4_io_end_t *io) { struct inode *inode = io->inode; loff_t offset = io->offset; - size_t size = io->size; + ssize_t size = io->size; int ret = 0; - ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); if (list_empty(&io->list)) return ret; - if (io->flag != DIO_AIO_UNWRITTEN) + if (io->flag != EXT4_IO_UNWRITTEN) return ret; - if (offset + size <= i_size_read(inode)) - ret = ext4_convert_unwritten_extents(inode, offset, size); - + ret = ext4_convert_unwritten_extents(inode, offset, size); if (ret < 0) { printk(KERN_EMERG "%s: failed to convert unwritten" "extents to written extents, error is %d" @@ -3579,50 +3648,64 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) io->flag = 0; return ret; } + /* * work on completed aio dio IO, to convert unwritten extents to extents */ -static void ext4_end_aio_dio_work(struct work_struct *work) +static void ext4_end_io_work(struct work_struct *work) { - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - int ret = 0; + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret; mutex_lock(&inode->i_mutex); - ret = ext4_end_aio_dio_nolock(io); - if (ret >= 0) { - if (!list_empty(&io->list)) - list_del_init(&io->list); - ext4_free_io_end(io); + ret = ext4_end_io_nolock(io); + if (ret < 0) { + mutex_unlock(&inode->i_mutex); + return; } + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + if (!list_empty(&io->list)) + list_del_init(&io->list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); mutex_unlock(&inode->i_mutex); + ext4_free_io_end(io); } + /* * This function is called from ext4_sync_file(). * - * When AIO DIO IO is completed, the work to convert unwritten - * extents to written is queued on workqueue but may not get immediately + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately * scheduled. When fsync is called, we need to ensure the * conversion is complete before fsync returns. - * The inode keeps track of a list of completed AIO from DIO path - * that might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents to written. + * The inode keeps track of a list of pending/completed IO that + * might needs to do the conversion. This function walks through + * the list and convert the related unwritten extents for completed IO + * to written. + * The function return the number of pending IOs on success. 
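flush_completed_IO() in the hunks above walks i_completed_io_list under the new i_completed_io_lock, but has to drop that spinlock around ext4_end_io_nolock(), which does journal work and may sleep, then retake it before unlinking the entry. Userspace model of the drop-and-retake idiom, with a pthread mutex standing in for the spinlock:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
	static int pending[3] = { 1, 1, 1 };	/* fake completed-IO list */

	/* stands in for ext4_end_io_nolock(): may sleep, so runs unlocked */
	static int convert(int i)
	{
		printf("converting io_end %d\n", i);
		return 0;
	}

	int main(void)
	{
		pthread_mutex_lock(&io_lock);
		for (int i = 0; i < 3; i++) {
			pthread_mutex_unlock(&io_lock); /* drop around sleeping work */
			int ret = convert(i);
			pthread_mutex_lock(&io_lock);	/* retake to edit the list */
			if (ret >= 0)
				pending[i] = 0;		/* list_del_init() equivalent */
		}
		pthread_mutex_unlock(&io_lock);
		return 0;
	}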
*/ -int flush_aio_dio_completed_IO(struct inode *inode) +int flush_completed_IO(struct inode *inode) { ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; int ret = 0; int ret2 = 0; - if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) + if (list_empty(&ei->i_completed_io_list)) return ret; - dump_aio_dio_list(inode); - while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ - io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, ext4_io_end_t, list); /* - * Calling ext4_end_aio_dio_nolock() to convert completed + * Calling ext4_end_io_nolock() to convert completed * IO to written. * * When ext4_sync_file() is called, run_queue() may already @@ -3635,20 +3718,23 @@ int flush_aio_dio_completed_IO(struct inode *inode) * avoid double converting from both fsync and background work * queue work. */ - ret = ext4_end_aio_dio_nolock(io); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (ret < 0) ret2 = ret; else list_del_init(&io->list); } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); return (ret2 < 0) ? ret2 : 0; } -static ext4_io_end_t *ext4_init_io_end (struct inode *inode) +static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) { ext4_io_end_t *io = NULL; - io = kmalloc(sizeof(*io), GFP_NOFS); + io = kmalloc(sizeof(*io), flags); if (io) { igrab(inode); @@ -3656,8 +3742,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode) io->flag = 0; io->offset = 0; io->size = 0; - io->error = 0; - INIT_WORK(&io->work, ext4_end_aio_dio_work); + io->page = NULL; + INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3669,6 +3755,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + unsigned long flags; + struct ext4_inode_info *ei; /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) @@ -3680,7 +3768,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); /* if not aio dio with unwritten extents, just free io and return */ - if (io_end->flag != DIO_AIO_UNWRITTEN){ + if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; return; @@ -3688,16 +3776,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, io_end->offset = offset; io_end->size = size; + io_end->flag = EXT4_IO_UNWRITTEN; wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); /* Add the io_end to per-inode completed aio dio list*/ - list_add_tail(&io_end->list, - &EXT4_I(io_end->inode)->i_aio_dio_complete_list); + ei = EXT4_I(io_end->inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &ei->i_completed_io_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; } + +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) +{ + ext4_io_end_t *io_end = bh->b_private; + struct workqueue_struct *wq; + struct inode *inode; + unsigned long flags; + + if (!test_clear_buffer_uninit(bh) || !io_end) + goto out; + + if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { + printk("sb umounted, discard end_io request 
for inode %lu\n", + io_end->inode->i_ino); + ext4_free_io_end(io_end); + goto out; + } + + io_end->flag = EXT4_IO_UNWRITTEN; + inode = io_end->inode; + + /* Add the io_end to per-inode completed io list*/ + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); + + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; + /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); +out: + bh->b_private = NULL; + bh->b_end_io = NULL; + clear_buffer_uninit(bh); + end_buffer_async_write(bh, uptodate); +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) +{ + ext4_io_end_t *io_end; + struct page *page = bh->b_page; + loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; + size_t size = bh->b_size; + +retry: + io_end = ext4_init_io_end(inode, GFP_ATOMIC); + if (!io_end) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: allocation fail\n", __func__); + schedule(); + goto retry; + } + io_end->offset = offset; + io_end->size = size; + /* + * We need to hold a reference to the page to make sure it + * doesn't get evicted before ext4_end_io_work() has a chance + * to convert the extent from unwritten to written. + */ + io_end->page = page; + get_page(io_end->page); + + bh->b_private = io_end; + bh->b_end_io = ext4_end_io_buffer_write; + return 0; +} + /* * For ext4 extent files, ext4 will do direct-io write to holes, * preallocated extents, and those write extend the file, no need to @@ -3751,7 +3908,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; EXT4_I(inode)->cur_aio_dio = NULL; if (!is_sync_kiocb(iocb)) { - iocb->private = ext4_init_io_end(inode); + iocb->private = ext4_init_io_end(inode, GFP_NOFS); if (!iocb->private) return -ENOMEM; /* @@ -3767,7 +3924,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, - ext4_get_block_dio_write, + ext4_get_block_write, ext4_end_io_dio); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; @@ -3788,8 +3945,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0 && (EXT4_I(inode)->i_state & - EXT4_STATE_DIO_UNWRITTEN)) { + } else if (ret > 0 && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3799,7 +3956,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; - EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } return ret; } @@ -4130,18 +4287,27 @@ no_top: * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. 
*/ -static void ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) { __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) flags |= EXT4_FREE_BLOCKS_METADATA; + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + ext4_error(inode->i_sb, "inode #%lu: " + "attempt to clear blocks %llu len %lu, invalid", + inode->i_ino, (unsigned long long) block_to_free, + count); + return 1; + } + if (try_to_extend_transaction(handle, inode)) { if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); @@ -4160,6 +4326,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, *p = 0; ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); + return 0; } /** @@ -4215,9 +4382,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - ext4_clear_blocks(handle, inode, this_bh, - block_to_free, - count, block_to_free_p, p); + if (ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p)) + break; block_to_free = nr; block_to_free_p = p; count = 1; @@ -4241,7 +4409,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "circular indirect block detected, " "inode=%lu, block=%llu", inode->i_ino, @@ -4281,6 +4449,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + ext4_error(inode->i_sb, + "indirect mapped block in inode " + "#%lu invalid (level %d, blk #%lu)", + inode->i_ino, depth, + (unsigned long) nr); + break; + } + /* Go read the buffer for the next level down */ bh = sb_bread(inode->i_sb, nr); @@ -4289,7 +4467,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). 
*/ if (!bh) { - ext4_error(inode->i_sb, "ext4_free_branches", + ext4_error(inode->i_sb, "Read failure, inode=%lu, block=%llu", inode->i_ino, nr); continue; @@ -4433,8 +4611,10 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; + EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ext4_ext_truncate(inode); @@ -4604,9 +4784,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - ext4_error(sb, "ext4_get_inode_loc", "unable to read " - "inode block - inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "unable to read inode block - " + "inode=%lu, block=%llu", inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4704,9 +4883,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(sb, __func__, - "unable to read inode block - inode=%lu, " - "block=%llu", inode->i_ino, block); + ext4_error(sb, "unable to read inode block - inode=%lu," + " block=%llu", inode->i_ino, block); brelse(bh); return -EIO; } @@ -4720,7 +4898,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { /* We have all inode data except xattrs in memory here. */ return __ext4_get_inode_loc(inode, iloc, - !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } void ext4_set_inode_flags(struct inode *inode) @@ -4814,7 +4992,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); - ei->i_state = 0; + ei->i_state_flags = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -4897,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ei->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } } else ei->i_extra_isize = 0; @@ -4917,8 +5095,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - ext4_error(sb, __func__, - "bad extended attribute block %llu in inode #%lu", + ext4_error(sb, "bad extended attribute block %llu inode #%lu", ei->i_file_acl, inode->i_ino); ret = -EIO; goto bad_inode; @@ -4964,8 +5141,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else { ret = -EIO; - ext4_error(inode->i_sb, __func__, - "bogus i_mode (%o) for inode=%lu", + ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", inode->i_mode, inode->i_ino); goto bad_inode; } @@ -5037,7 +5213,7 @@ static int ext4_do_update_inode(handle_t *handle, /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. 
*/ - if (ei->i_state & EXT4_STATE_NEW) + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_get_inode_flags(ei); @@ -5101,7 +5277,7 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); sb->s_dirt = 1; ext4_handle_sync(handle); - err = ext4_handle_dirty_metadata(handle, inode, + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } @@ -5130,10 +5306,10 @@ static int ext4_do_update_inode(handle_t *handle, } BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, inode, bh); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; - ei->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); ext4_update_inode_fsync_trans(handle, inode, 0); out_brelse: @@ -5177,7 +5353,7 @@ out_brelse: * `stuff()' is running, and the new i_size will be lost. Plus the inode * will no longer be on the superblock's dirty inode list. */ -int ext4_write_inode(struct inode *inode, int wait) +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; @@ -5191,7 +5367,7 @@ int ext4_write_inode(struct inode *inode, int wait) return -EIO; } - if (!wait) + if (wbc->sync_mode != WB_SYNC_ALL) return 0; err = ext4_force_commit(inode->i_sb); @@ -5201,13 +5377,11 @@ int ext4_write_inode(struct inode *inode, int wait) err = ext4_get_inode_loc(inode, &iloc); if (err) return err; - if (wait) + if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_error(inode->i_sb, __func__, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, + ext4_error(inode->i_sb, "IO error syncing inode, " + "inode=%lu, block=%llu", inode->i_ino, (unsigned long long)iloc.bh->b_blocknr); err = -EIO; } @@ -5249,6 +5423,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if (ia_valid & ATTR_SIZE) + dquot_initialize(inode); if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; @@ -5261,7 +5437,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) error = PTR_ERR(handle); goto err_out; } - error = vfs_dq_transfer(inode, attr) ? 
-EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext4_journal_stop(handle); return error; @@ -5288,7 +5464,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size || + (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -5319,6 +5497,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } + /* ext4_truncate will clear the flag */ + if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) + ext4_truncate(inode); } rc = inode_setattr(inode, attr); @@ -5557,8 +5738,8 @@ static int ext4_expand_extra_isize(struct inode *inode, entry = IFIRST(header); /* No extended attributes present */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || - header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, new_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; @@ -5602,7 +5783,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block * with this same handle. If journal_extend fails, then it will @@ -5616,10 +5797,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete" " some EAs or run e2fsck.", inode->i_ino); @@ -5641,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, vfs_dq_alloc_block() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. 
* * If the inode is marked synchronous, we don't honour that here - doing @@ -5683,7 +5865,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) err = ext4_handle_dirty_metadata(handle, - inode, + NULL, iloc.bh); brelse(iloc.bh); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b63d193126d..016d0249294 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags &= ~EXT4_EXTENTS_FL; } + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -249,7 +258,8 @@ setversion_out: if (me.moved_len > 0) file_remove_suid(donor_filp); - if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) err = -EFAULT; mext_out: fput(donor_filp); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d34afad3e13..54df209d2ee 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -69,7 +69,7 @@ * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> lenght for this prealloc space + * pa_len -> length for this prealloc space * pa_free -> free space available in this prealloc space * * The inode preallocation space is used looking at the _logical_ start @@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += first + i; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_grp_locked_error(sb, e4b->bd_group, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", @@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += block; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_grp_locked_error(sb, e4b->bd_group, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %u)", @@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_super_block *es = sbi->s_es; struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) @@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + - ex.fe_start + le32_to_cpu(es->s_first_data_block); + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; @@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned 
block in group */ - first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) - + le32_to_cpu(sbi->s_es->s_first_data_block); + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; @@ -2256,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root.rb_node = NULL; + meta_group_info[i]->bb_free_root = RB_ROOT; #ifdef DOUBLE_CHECK { @@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_unlock_group(sb, entry->group); if (test_opt(sb, DISCARD)) { ext4_fsblk_t discard_block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - discard_block = (ext4_fsblk_t)entry->group * - EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(es->s_first_data_block); + discard_block = entry->start_blk + + ext4_group_first_block_no(sb, entry->group); trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, entry->count); @@ -2703,14 +2697,11 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (err) goto out_err; - block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + ac->ac_b_ex.fe_start - + le32_to_cpu(es->s_first_data_block); + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = ac->ac_b_ex.fe_len; if (!ext4_data_block_valid(sbi, block, len)) { - ext4_error(sb, __func__, - "Allocating blocks %llu-%llu which overlap " + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata\n", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation @@ -3161,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; - goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + - ac->ac_g_ex.fe_start + - le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. 
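The mballoc.c hunks above repeatedly collapse the open-coded "group * EXT4_BLOCKS_PER_GROUP(sb) + offset + le32_to_cpu(es->s_first_data_block)" arithmetic into the ext4_group_first_block_no() and ext4_grp_offs_to_block() helpers. For reference, here is a minimal standalone sketch of what those helpers compute; the struct layout and the names used here (layout, group_first_block_no, grp_offs_to_block) are simplified stand-ins, not the actual ext4 definitions:

#include <stdio.h>

typedef unsigned long long fsblk_t;

/* Stand-in for the superblock fields the real helpers read. */
struct layout {
	unsigned int blocks_per_group;	/* EXT4_BLOCKS_PER_GROUP(sb) */
	fsblk_t first_data_block;	/* le32_to_cpu(es->s_first_data_block) */
};

/* First absolute block number of a block group. */
static fsblk_t group_first_block_no(const struct layout *l, unsigned int group)
{
	return (fsblk_t)group * l->blocks_per_group + l->first_data_block;
}

/* A (group, offset-in-group) pair maps to an absolute block number. */
static fsblk_t grp_offs_to_block(const struct layout *l,
				 unsigned int group, unsigned int offset)
{
	return group_first_block_no(l, group) + offset;
}

int main(void)
{
	struct layout l = { 32768, 0 };	/* a typical 4KiB-block filesystem */

	printf("group 5 starts at block %llu\n", group_first_block_no(&l, 5));
	printf("group 5, offset 100 -> block %llu\n",
	       grp_offs_to_block(&l, 5, 100));
	return 0;
}

Centralizing the widening cast to a 64-bit block number in one helper is also what lets the call sites above drop the repeated "(ext4_fsblk_t) fe_group * ..." boilerplate.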
@@ -3526,8 +3515,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + - le32_to_cpu(sbi->s_es->s_first_data_block); + start = ext4_group_first_block_no(sb, group) + bit; mb_debug(1, " free preallocated %u/%u in group %u\n", (unsigned) start, (unsigned) next - bit, (unsigned) group); @@ -3623,15 +3611,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); + ext4_error(sb, "Error reading block bitmap for %u", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", group); put_bh(bitmap_bh); return 0; } @@ -3804,15 +3790,15 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); + ext4_error(sb, "Error reading block bitmap for %u", + group); ext4_mb_release_desc(&e4b); continue; } @@ -3938,7 +3924,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) /* don't use group allocation for large files */ size = max(size, isize); - if (size >= sbi->s_mb_stream_request) { + if (size > sbi->s_mb_stream_request) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } @@ -4077,8 +4063,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } ext4_lock_group(sb, group); @@ -4254,7 +4240,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, return 0; } reserv_blks = ar->len; - while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { + while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } @@ -4331,7 +4317,7 @@ out2: kmem_cache_free(ext4_ac_cachep, ac); out1: if (inquota && ar->len < inquota) - vfs_dq_free_block(ar->inode, inquota - ar->len); + dquot_free_block(ar->inode, inquota - ar->len); out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) @@ -4476,10 +4462,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; - if (!ext4_data_block_valid(sbi, block, count)) { - ext4_error(sb, __func__, - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } @@ -4547,8 +4533,7 @@ do_more: in_range(block + count - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __func__, - "Freeing blocks in system zone - " + ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. 
ext4_std_error should be a no op */ goto error_return; @@ -4646,7 +4631,7 @@ do_more: sb->s_dirt = 1; error_return: if (freed) - vfs_dq_free_block(inode, freed); + dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); if (ac) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 436521cae45..b619322c76f 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -220,16 +220,9 @@ struct ext4_buddy { #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { - ext4_fsblk_t block; - - block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + fex->fe_start - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - return block; + return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start; } #endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 81415814b00..8b87bd0eac9 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * happened after we started the migrate. We need to * fail the migrate */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else - EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); /* * We have the extent map built with the tmp inode. * Now copy the i_data across @@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode) } i_size_write(tmp_inode, i_size_read(inode)); /* - * We don't want the inode to be reclaimed - * if we got interrupted in between. We have - * this tmp inode carrying reference to the - * data blocks of the original file. We set - * the i_nlink to zero at the last stage after - * switching the original file to extent format + * Set the i_nlink to zero so it will be deleted later + * when we drop the inode reference. */ - tmp_inode->i_nlink = 1; + tmp_inode->i_nlink = 0; ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); @@ -533,10 +529,20 @@ int ext4_ext_migrate(struct inode *inode) * allocation. 
*/ down_read((&EXT4_I(inode)->i_data_sem)); - EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + /* + * It is impossible to update on-disk structures without + * a handle, so just roll back in-core changes and leave other + * work to orphan_list_cleanup() + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); + goto out; + } ei = EXT4_I(inode); i_data = ei->i_data; @@ -618,15 +624,8 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); - - /* - * Set the i_nlink to zero so that - * generic_drop_inode really deletes the - * inode - */ - tmp_inode->i_nlink = 0; - ext4_journal_stop(handle); +out: unlock_new_inode(tmp_inode); iput(tmp_inode); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 82c415be87a..aa5fe28d180 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, int ret = 0; if (inode1 == NULL) { - ext4_error(inode2->i_sb, function, + __ext4_error(inode2->i_sb, function, "Both inodes should not be NULL: " "inode1 NULL inode2 %lu", inode2->i_ino); ret = -EIO; } else if (inode2 == NULL) { - ext4_error(inode1->i_sb, function, + __ext4_error(inode1->i_sb, function, "Both inodes should not be NULL: " "inode1 %lu inode2 NULL", inode1->i_ino); ret = -EIO; @@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, } o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); new_flag = 1; } else if (start_ext->ee_len && new_ext->ee_len && @@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, * orig |------------------------------| */ o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); new_flag = 1; } else if (!start_ext->ee_len && new_ext->ee_len && @@ -475,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, struct ext4_extent *oext, *o_start, *o_end, *prev_ext; struct ext4_extent new_ext, start_ext, end_ext; ext4_lblk_t new_ext_end; - ext4_fsblk_t new_phys_end; int oext_alen, new_ext_alen, end_ext_alen; int depth = ext_depth(orig_inode); int ret; @@ -489,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, new_ext.ee_len = dext->ee_len; new_ext_alen = ext4_ext_get_actual_len(&new_ext); new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; - new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1; /* * Case: original extent is first @@ -502,6 +502,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, le32_to_cpu(oext->ee_block) + oext_alen) { start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - le32_to_cpu(oext->ee_block)); + start_ext.ee_block = oext->ee_block; copy_extent_status(oext, &start_ext); } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { prev_ext = oext - 1; @@ -515,6 +516,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, start_ext.ee_len = cpu_to_le16( ext4_ext_get_actual_len(prev_ext) + new_ext_alen); + start_ext.ee_block = oext->ee_block; copy_extent_status(prev_ext, &start_ext); new_ext.ee_len = 0; } @@ -526,7 +528,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * new_ext |-------| */ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - ext4_error(orig_inode->i_sb, __func__, + ext4_error(orig_inode->i_sb, "new_ext_end(%u) should be 
less than or equal to " "oext->ee_block(%u) + oext_alen(%d) - 1", new_ext_end, le32_to_cpu(oext->ee_block), @@ -689,12 +691,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, while (1) { /* The extent for donor must be found. */ if (!dext) { - ext4_error(donor_inode->i_sb, __func__, + ext4_error(donor_inode->i_sb, "The extent for donor must be found"); *err = -EIO; goto out; } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - ext4_error(donor_inode->i_sb, __func__, + ext4_error(donor_inode->i_sb, "Donor offset(%u) and the first block of donor " "extent(%u) should be equal", donor_off, @@ -928,7 +930,7 @@ out2: } /** - * mext_check_argumants - Check whether move extent can be done + * mext_check_arguments - Check whether move extent can be done * * @orig_inode: original inode * @donor_inode: donor inode @@ -949,14 +951,6 @@ mext_check_arguments(struct inode *orig_inode, unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; - /* Regular file check */ - if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { - ext4_debug("ext4 move extent: The argument files should be " - "regular file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", @@ -1204,6 +1198,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, return -EINVAL; } + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + /* Protect orig and donor inodes against a truncate */ ret1 = mext_inode_double_lock(orig_inode, donor_inode); if (ret1 < 0) @@ -1351,7 +1353,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, if (ret1 < 0) break; if (*moved_len > len) { - ext4_error(orig_inode->i_sb, __func__, + ext4_error(orig_inode->i_sb, "We replaced blocks too much! 
" "sum of replaced: %llu requested: %llu", *moved_len, len); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 17a17e10dd6..0c070fabd10 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, __func__, - "Unrecognised inode hash code %d", + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, __func__, - "Unimplemented inode hash flags: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, __func__, - "Unimplemented inode hash depth: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, __func__, - "dx entry: limit != root limit"); + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; goto fail; @@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -494,7 +490,7 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext4_warning(dir->i_sb, __func__, + ext4_warning(dir->i_sb, "Corrupt dir inode %ld, running e2fsck is " "recommended.", dir->i_ino); return NULL; @@ -947,9 +943,8 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, __func__, "reading directory #%lu " - "offset %lu", dir->i_ino, - (unsigned long)block); + ext4_error(sb, "reading directory #%lu offset %lu", + dir->i_ino, (unsigned long)block); brelse(bh); goto next; } @@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q retval = ext4_htree_next_block(dir, hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, __func__, + ext4_warning(sb, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { - ext4_error(dir->i_sb, "ext4_lookup", - "bad inode number: %u", ino); + ext4_error(dir->i_sb, "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); if (unlikely(IS_ERR(inode))) { if (PTR_ERR(inode) == -ESTALE) { - ext4_error(dir->i_sb, 
__func__, + ext4_error(dir->i_sb, "deleted inode referenced: %u", ino); return ERR_PTR(-EIO); @@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child) brelse(bh); if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - ext4_error(child->d_inode->i_sb, "ext4_get_parent", + ext4_error(child->d_inode->i_sb, "bad inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); if ((char *) de >= (((char *) root) + blocksize)) { - ext4_error(dir->i_sb, __func__, + ext4_error(dir->i_sb, "invalid rec_len for '..' in inode %lu", dir->i_ino); brelse(bh); @@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, __func__, - "Directory index full!"); + ext4_warning(sb, "Directory index full!"); err = -ENOSPC; goto cleanup; } @@ -1766,6 +1759,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, struct inode *inode; int err, retries = 0; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1800,6 +1795,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1837,6 +1834,8 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + @@ -1916,11 +1915,11 @@ static int empty_dir(struct inode *inode) if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { if (err) - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "error %d reading directory #%lu offset 0", err, inode->i_ino); else - ext4_warning(inode->i_sb, __func__, + ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no data block", inode->i_ino); return 1; @@ -1931,7 +1930,7 @@ static int empty_dir(struct inode *inode) !le32_to_cpu(de1->inode) || strcmp(".", de->name) || strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, "empty_dir", + ext4_warning(inode->i_sb, "bad directory (dir #%lu) - no `.' or `..'", inode->i_ino); brelse(bh); @@ -1949,7 +1948,7 @@ static int empty_dir(struct inode *inode) offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) - ext4_error(sb, __func__, + ext4_error(sb, "error %d reading directory" " #%lu offset %u", err, inode->i_ino, offset); @@ -2020,11 +2019,18 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_unlock; + /* + * Due to previous errors, the inode may already be a part of the + * on-disk orphan list. If so, skip the on-disk list modification. + */ + if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= + (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) + goto mem_insert; /* Insert this inode at the head of the on-disk orphan list... 
*/ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -2037,6 +2043,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * * This is safe: on error we're going to ignore the orphan list * anyway on the next recovery. */ +mem_insert: if (!err) list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); @@ -2096,7 +2103,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = @@ -2136,7 +2143,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2163,7 +2172,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, "ext4_rmdir", + ext4_warning(inode->i_sb, "empty directory has too many links (%d)", inode->i_nlink); inode->i_version++; @@ -2195,7 +2204,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ - vfs_dq_init(dentry->d_inode); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); + handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2215,7 +2226,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - ext4_warning(inode->i_sb, "ext4_unlink", + ext4_warning(inode->i_sb, "Deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); inode->i_nlink = 1; @@ -2250,6 +2261,8 @@ static int ext4_symlink(struct inode *dir, if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + dquot_initialize(dir); + retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + @@ -2308,6 +2321,8 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; + dquot_initialize(dir); + /* * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing * otherwise has the potential to corrupt the orphan inode list. 
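The namei.c hunks above all follow one pattern: call dquot_initialize() on every inode the operation may dirty (the parent directory as well as the victim inode) before the journal handle is started, replacing the old vfs_dq_init() call on just the victim. That ordering keeps any quota-file writes in their own transaction. Below is a standalone toy model of that ordering; the stubs (toy_inode, dquot_initialize_stub, journal_start/journal_stop) stand in for the real kernel calls, and only the call sequence is meaningful:

#include <stdio.h>

struct toy_inode { unsigned long i_ino; };

/* kernel: attach quota structures; may itself write the quota file */
static void dquot_initialize_stub(struct toy_inode *inode)
{
	printf("dquot_initialize(inode %lu)\n", inode->i_ino);
}

static void journal_start(void) { printf("journal handle started\n"); }
static void journal_stop(void) { printf("journal handle stopped\n"); }

/* Modeled on the reworked ext4_unlink(); error paths omitted. */
static void unlink_op(struct toy_inode *dir, struct toy_inode *victim)
{
	/*
	 * Both inodes get dirtied (directory entry removal, i_nlink
	 * drop), so both quotas are initialized before the handle
	 * exists; quota-file IO then lands in a separate transaction.
	 */
	dquot_initialize_stub(dir);
	dquot_initialize_stub(victim);

	journal_start();
	/* ... remove the entry, drop i_nlink, orphan-list handling ... */
	journal_stop();
}

int main(void)
{
	struct toy_inode dir = { 2 }, victim = { 12 };

	unlink_op(&dir, &victim);
	return 0;
}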
@@ -2358,12 +2373,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_bh = new_bh = dir_bh = NULL; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new_dentry->d_inode) - vfs_dq_init(new_dentry->d_inode); + dquot_initialize(new_dentry->d_inode); handle = ext4_journal_start(old_dir, 2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); @@ -2462,7 +2480,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } } if (retval) { - ext4_warning(old_dir->i_sb, "ext4_rename", + ext4_warning(old_dir->i_sb, "Deleting old file (%lu), %d, error=%d", old_dir->i_ino, old_dir->i_nlink, retval); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3b2c5541d8a..5692c48754a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) - ext4_warning(sb, __func__, - "Cannot add at group %u (only %u groups)", + ext4_warning(sb, "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); else if (offset != 0) - ext4_warning(sb, __func__, "Last group not full"); + ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext4_warning(sb, __func__, "Reserved blocks too high (%u)", + ext4_warning(sb, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) - ext4_warning(sb, __func__, "Bad blocks count %u", + ext4_warning(sb, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext4_warning(sb, __func__, - "Cannot read last block (%llu)", + ext4_warning(sb, "Cannot read last block (%llu)", end - 1); else if (outside(input->block_bitmap, start, end)) - ext4_warning(sb, __func__, - "Block bitmap not in group (block %llu)", + ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext4_warning(sb, __func__, - "Inode bitmap not in group (block %llu)", + ext4_warning(sb, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) - ext4_warning(sb, __func__, - "Inode table not in group (blocks %llu-%llu)", + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext4_warning(sb, __func__, - "Block bitmap same as inode bitmap (%llu)", + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext4_warning(sb, __func__, - "Block bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext4_warning(sb, __func__, - "Inode bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if 
(inside(input->block_bitmap, start, metaend)) - ext4_warning(sb, __func__, - "Block bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext4_warning(sb, __func__, - "Inode bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) - ext4_warning(sb, __func__, - "Inode table (%llu-%llu) overlaps" - "GDT table (%llu-%llu)", + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", (unsigned long long)input->inode_table, itend - 1, start, metaend - 1); else @@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ - ext4_warning(sb, __func__, - "reserved GDT %llu" + ext4_warning(sb, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, grp * @@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, __func__, - "won't resize using backup superblock at %llu", + ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); return -EPERM; } @@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext4_warning(sb, __func__, - "new group %u GDT block %llu not reserved", + ext4_warning(sb, "new group %u GDT block %llu not reserved", input->group, gdblock); err = -EINVAL; goto exit_dind; @@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning(sb, __func__, + ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); goto exit_inode; } @@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext4_warning(sb, __func__, - "reserved block %llu" + ext4_warning(sb, "reserved block %llu" " not at offset %ld", blk, (long)(data - (__le32 *)dind->b_data)); @@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext4_warning(sb, __func__, - "can't update backup for group %u (err %d), " + ext4_warning(sb, "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); @@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext4_warning(sb, __func__, - "Can't resize non-sparse filesystem further"); + ext4_warning(sb, "Can't resize non-sparse filesystem further"); return -EPERM; } if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __func__, "blocks_count overflow"); + ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } if 
(le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __func__, "inodes_count overflow"); + ext4_warning(sb, "inodes_count overflow"); return -EINVAL; } @@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { - ext4_warning(sb, __func__, + ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext4_iget(sb, EXT4_RESIZE_INO); if (IS_ERR(inode)) { - ext4_warning(sb, __func__, - "Error opening resize inode"); + ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); } } @@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { - ext4_warning(sb, __func__, - "multiple resizers run on filesystem!"); + ext4_warning(sb, "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; } @@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); + ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext4_warning(sb, __func__, - "can't shrink FS - resize aborted"); + ext4_warning(sb, "can't shrink FS - resize aborted"); return -EBUSY; } @@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { - ext4_warning(sb, __func__, - "need to use ext2online to resize further"); + ext4_warning(sb, "need to use ext2online to resize further"); return -EPERM; } add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext4_warning(sb, __func__, "blocks_count overflow"); + ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } @@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext4_warning(sb, __func__, - "will only finish group (%llu" - " blocks, %u new)", + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ bh = sb_bread(sb, o_blocks_count + add - 1); if (!bh) { - ext4_warning(sb, __func__, - "can't read last block, resize aborted"); + ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); @@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - ext4_warning(sb, __func__, "error %d on journal start", err); + ext4_warning(sb, "error %d on journal start", err); goto exit_put; } mutex_lock(&EXT4_SB(sb)->s_resize_lock); if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, __func__, - "multiple resizers run on filesystem!"); + ext4_warning(sb, "multiple resizers run on filesystem!"); mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); err = -EBUSY; @@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { 
- ext4_warning(sb, __func__, - "error %d on journal write access", err); + ext4_warning(sb, "error %d on journal write access", err); mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 735c20d5fd5..ba191dae873 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -302,7 +302,7 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in - * the journal instead. On recovery, the journal will compain about + * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. */ @@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb) sb->s_id); } -void ext4_error(struct super_block *sb, const char *function, +void __ext4_error(struct super_block *sb, const char *function, const char *fmt, ...) { va_list args; @@ -347,6 +347,42 @@ void ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } +void ext4_error_inode(const char *function, struct inode *inode, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", + inode->i_sb->s_id, function, inode->i_ino, current->comm); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + +void ext4_error_file(const char *function, struct file *file, + const char *fmt, ...) +{ + va_list args; + struct inode *inode = file->f_dentry->d_inode; + char pathname[80], *path; + + va_start(args, fmt); + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", + inode->i_sb->s_id, function, inode->i_ino, current->comm, path); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + ext4_handle_error(inode->i_sb); +} + static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { @@ -450,7 +486,7 @@ void ext4_msg (struct super_block * sb, const char *prefix, va_end(args); } -void ext4_warning(struct super_block *sb, const char *function, +void __ext4_warning(struct super_block *sb, const char *function, const char *fmt, ...) 
{ va_list args; @@ -507,7 +543,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; - ext4_warning(sb, __func__, + ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); @@ -708,7 +744,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif - INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + INIT_LIST_HEAD(&ei->i_completed_io_list); + spin_lock_init(&ei->i_completed_io_lock); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; @@ -761,6 +798,7 @@ static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { + dquot_drop(inode); ext4_discard_preallocations(inode); if (EXT4_JOURNAL(inode)) jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, @@ -796,10 +834,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, if (sbi->s_qf_names[GRPQUOTA]) seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) + if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); - if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) + if (test_opt(sb, GRPQUOTA)) seq_puts(seq, ",grpquota"); #endif } @@ -926,6 +964,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOLOAD)) seq_puts(seq, ",norecovery"); + if (test_opt(sb, DIOREAD_NOLOCK)) + seq_puts(seq, ",dioread_nolock"); + ext4_show_quota_options(seq, sb); return 0; @@ -1012,19 +1053,9 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static const struct dquot_operations ext4_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .reserve_space = dquot_reserve_space, - .claim_space = dquot_claim_space, - .release_rsv = dquot_release_reserved_space, #ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, #endif - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, @@ -1109,6 +1140,7 @@ enum { Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, }; @@ -1176,6 +1208,8 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc=%u"}, {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL}, @@ -1205,6 +1239,66 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return 0; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + 
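/*
 * The set_qf_name()/clear_qf_name() helpers being factored out here
 * replace two duplicated, goto-threaded switch arms of
 * parse_options() with calls that return 0 on failure and 1 on
 * success. A hedged standalone sketch of the same factoring;
 * set_name() and qf_name[] are illustrative, not the ext4 code:
 */
#include <stdlib.h>
#include <string.h>

static char *qf_name[2];

static int set_name(int type, const char *arg)
{
        char *copy = strdup(arg);

        if (!copy)
                return 0;              /* allocation failed: abort parse */
        if (qf_name[type] && strcmp(qf_name[type], copy)) {
                free(copy);            /* conflicting second name */
                return 0;
        }
        free(qf_name[type]);
        qf_name[type] = copy;
        return 1;                      /* success */
}

int main(void)
{
        return set_name(0, "aquota.user") ? 0 : 1;
}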
"Not enough memory for storing quotafile name"); + return 0; + } + if (sbi->s_qf_names[qtype] && + strcmp(sbi->s_qf_names[qtype], qname)) { + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return 0; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; + } + set_opt(sbi->s_mount_opt, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) +{ + + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return 0; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif static int parse_options(char *options, struct super_block *sb, unsigned long *journal_devnum, @@ -1217,8 +1311,7 @@ static int parse_options(char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype, qfmt; - char *qname; + int qfmt; #endif if (!options) @@ -1229,19 +1322,31 @@ static int parse_options(char *options, struct super_block *sb, if (!*p) continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); clear_opt(sbi->s_mount_opt, MINIX_DF); break; case Opt_minix_df: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); set_opt(sbi->s_mount_opt, MINIX_DF); + break; case Opt_grpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); set_opt(sbi->s_mount_opt, GRPID); + break; case Opt_nogrpid: + ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38"); clear_opt(sbi->s_mount_opt, GRPID); + break; case Opt_resuid: if (match_int(&args[0], &option)) @@ -1378,14 +1483,13 @@ static int parse_options(char *options, struct super_block *sb, data_opt = EXT4_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) - != data_opt) { + if (test_opt(sb, DATA_FLAGS) != data_opt) { ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); return 0; } } else { - sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); sbi->s_mount_opt |= data_opt; } break; @@ -1397,63 +1501,22 @@ static int parse_options(char *options, struct super_block *sb, break; #ifdef CONFIG_QUOTA case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); + if (!set_qf_name(sb, USRQUOTA, &args[0])) return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - ext4_msg(sb, KERN_ERR, - "Not enough memory for " - "storing quotafile name"); - return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext4_msg(sb, KERN_ERR, - "%s quota file already " - "specified", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext4_msg(sb, KERN_ERR, - "quotafile must be on " - "filesystem root"); - 
kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + break; + case Opt_grpjquota: + if (!set_qf_name(sb, GRPQUOTA, &args[0])) return 0; - } - set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; + if (!clear_qf_name(sb, USRQUOTA)) + return 0; + break; case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, "Cannot change " - "journaled quota options when " - "quota turned on"); + if (!clear_qf_name(sb, GRPQUOTA)) return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; break; + case Opt_jqfmt_vfsold: qfmt = QFMT_VFS_OLD; goto set_qf_format; @@ -1518,10 +1581,11 @@ set_qf_format: clear_opt(sbi->s_mount_opt, BARRIER); break; case Opt_barrier: - if (match_int(&args[0], &option)) { - set_opt(sbi->s_mount_opt, BARRIER); - break; - } + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) set_opt(sbi->s_mount_opt, BARRIER); else @@ -1594,10 +1658,11 @@ set_qf_format: set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; case Opt_auto_da_alloc: - if (match_int(&args[0], &option)) { - clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); - break; - } + if (args[0].from) { + if (match_int(&args[0], &option)) + return 0; + } else + option = 1; /* No argument, default to 1 */ if (option) clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); else @@ -1609,6 +1674,12 @@ set_qf_format: case Opt_nodiscard: clear_opt(sbi->s_mount_opt, DISCARD); break; + case Opt_dioread_nolock: + set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + break; + case Opt_dioread_lock: + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + break; default: ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " @@ -1618,18 +1689,13 @@ set_qf_format: } #ifdef CONFIG_QUOTA if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sbi->s_mount_opt, USRQUOTA); - if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) clear_opt(sbi->s_mount_opt, GRPQUOTA); - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { ext4_msg(sb, KERN_ERR, "old and new quota " "format mixing"); return 0; @@ -1939,7 +2005,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - vfs_dq_init(inode); + dquot_initialize(inode); if (inode->i_nlink) { ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", @@ -2292,7 +2358,7 @@ static void ext4_sb_release(struct kobject *kobj) } -static struct sysfs_ops ext4_attr_ops = { +static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, }; @@ -2432,8 +2498,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) def_mount_opts = le32_to_cpu(es->s_default_mount_opts); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sbi->s_mount_opt, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) { + ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups", + "2.6.38"); 
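/*
 * The barrier/auto_da_alloc changes above handle options that may
 * appear bare ("barrier") or with a value ("barrier=0"). Zeroing
 * args[0] before match_token() lets args[0].from double as a "was a
 * value supplied?" flag, and a malformed value now fails the mount
 * instead of being silently ignored. A hedged sketch of the same
 * decision in plain C (parse_flag() is illustrative):
 */
#include <stdio.h>
#include <stdlib.h>

/* returns -1 on malformed input, else 0 or 1 */
static int parse_flag(const char *arg)
{
        char *end;
        long v;

        if (!arg)
                return 1;              /* bare option: default to enabled */
        v = strtol(arg, &end, 10);
        if (*end != '\0' || end == arg)
                return -1;             /* malformed value: reject */
        return v ? 1 : 0;
}

int main(void)
{
        printf("%d %d %d\n", parse_flag(NULL), parse_flag("0"),
               parse_flag("x"));       /* prints: 1 0 -1 */
        return 0;
}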
set_opt(sbi->s_mount_opt, GRPID); + } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); #ifdef CONFIG_EXT4_FS_XATTR @@ -2445,11 +2514,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + set_opt(sbi->s_mount_opt, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + set_opt(sbi->s_mount_opt, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sbi->s_mount_opt, ERRORS_PANIC); @@ -2477,7 +2546,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || @@ -2766,7 +2835,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); - goto failed_mount4; + goto failed_mount_wq; } else { clear_opt(sbi->s_mount_opt, DATA_FLAGS); set_opt(sbi->s_mount_opt, WRITEBACK_DATA); @@ -2779,7 +2848,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); - goto failed_mount4; + goto failed_mount_wq; } if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { @@ -2818,7 +2887,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); - goto failed_mount4; + goto failed_mount_wq; } default: break; @@ -2826,13 +2895,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); no_journal: - if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " "its supported only with writeback mode"); clear_opt(sbi->s_mount_opt, NOBH); } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " + "not supported with nobh mode"); + goto failed_mount_wq; + } } EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { @@ -2897,6 +2970,18 @@ no_journal: "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); } + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - requested data journaling mode"); + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + } + if (sb->s_blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock " + "option - block size is too small"); + clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK); + } + } err = ext4_setup_system_zone(sb); if (err) { @@ -3360,10 +3445,9 @@ 
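/*
 * dioread_nolock is only honoured when the checks above pass: it is
 * rejected under nobh, and quietly dropped with data=journal or a
 * block size below the page size. A hedged sketch of that validation
 * as a plain predicate (constants and names are illustrative):
 */
#include <stdbool.h>

#define PAGE_SZ 4096U

enum data_mode { DATA_ORDERED, DATA_WRITEBACK, DATA_JOURNAL };

static bool dioread_nolock_ok(enum data_mode mode, bool nobh,
                              unsigned int blocksize)
{
        if (nobh)
                return false;          /* rejected outright at mount */
        if (mode == DATA_JOURNAL)
                return false;          /* dropped with a warning */
        if (blocksize < PAGE_SZ)
                return false;          /* block size too small */
        return true;
}

int main(void)
{
        return dioread_nolock_ok(DATA_ORDERED, false, 4096) ? 0 : 1;
}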
static void ext4_clear_journal_err(struct super_block *sb, char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); - ext4_warning(sb, __func__, "Filesystem error recorded " + ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, __func__, "Marking fs in need of " - "filesystem check."); + ext4_warning(sb, "Marking fs in need of filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); @@ -3514,7 +3598,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); es = sbi->s_es; @@ -3708,7 +3792,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) * Process 1 Process 2 * ext4_create() quota_sync() * jbd2_journal_start() write_dquot() - * vfs_dq_init() down(dqio_mutex) + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) jbd2_journal_start() * */ @@ -3917,9 +4001,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -3929,52 +4011,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, (unsigned long long)off, (unsigned long long)len); return -EIO; } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? 
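/*
 * The rewritten ext4_quota_write() here refuses any write that would
 * cross a block boundary, which is what lets the old copy-in-a-loop
 * body collapse into a single-block path. A hedged sketch of the
 * precondition (plain C, names illustrative):
 */
#include <stdio.h>

static int crosses_block(unsigned int blocksize,
                         unsigned long long off, unsigned long long len)
{
        unsigned int offset = off & (blocksize - 1); /* power-of-2 size */

        return blocksize - offset < len;
}

int main(void)
{
        printf("%d\n", crosses_block(4096, 4090, 10)); /* 1: spans blocks */
        printf("%d\n", crosses_block(4096, 0, 4096));  /* 0: one block   */
        return 0;
}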
- sb->s_blocksize - offset : towrite; - bh = ext4_bread(handle, inode, blk, 1, &err); - if (!bh) + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + if (journal_quota) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); goto out; - if (journal_quota) { - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext4_handle_dirty_metadata(handle, NULL, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext4_jbd2_file_inode(handle, inode); - mark_buffer_dirty(bh); - } - brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + if (journal_quota) + err = ext4_handle_dirty_metadata(handle, NULL, bh); + else { + /* Always do at least ordered writes for quotas */ + err = ext4_jbd2_file_inode(handle, inode); + mark_buffer_dirty(bh); + } + brelse(bh); out: - if (len == towrite) { + if (err) { mutex_unlock(&inode->i_mutex); return err; } - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; } inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext4_mark_inode_dirty(handle, inode); mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f3a2f7ed45a..b4c5aa8489d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { -bad_block: ext4_error(inode->i_sb, __func__, +bad_block: + ext4_error(inode->i_sb, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -267,7 +268,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -396,7 +397,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -494,7 +495,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); if (ce) @@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, 
atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -787,8 +787,8 @@ inserted: else { /* The old block is released after updating the inode. */ - error = -EDQUOT; - if (vfs_dq_alloc_block(inode, 1)) + error = dquot_alloc_block(inode, 1); + if (error) goto cleanup; error = ext4_journal_get_write_access(handle, new_bh); @@ -876,13 +876,12 @@ cleanup: return error; cleanup_dquot: - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto cleanup; bad_block: - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; #undef header @@ -908,7 +907,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -940,10 +939,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } return 0; } @@ -986,8 +985,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (strlen(name) > 255) return -ERANGE; down_write(&EXT4_I(inode)->xattr_sem); - no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_get_inode_loc(inode, &is.iloc); if (error) @@ -997,10 +996,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (error) goto cleanup; - if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); @@ -1052,7 +1051,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); if (!value) - EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1067,7 +1066,7 @@ cleanup: brelse(is.iloc.bh); brelse(bs.bh); if (no_expand == 0) - EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return error; } @@ -1195,9 +1194,8 @@ retry: if (!bh) goto cleanup; if 
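/*
 * The conversions above replace open-coded tests of i_state (such as
 * "EXT4_I(inode)->i_state & EXT4_STATE_XATTR") with
 * ext4_{test,set,clear}_inode_state() accessors. A hedged sketch of
 * the accessor pattern; note the real helpers are built on atomic
 * bitops, while this standalone version is not:
 */
enum { STATE_NEW, STATE_XATTR, STATE_NO_EXPAND };

struct toy_inode { unsigned long state; };

static int test_state(const struct toy_inode *i, int bit)
{
        return (i->state >> bit) & 1;
}

static void set_state(struct toy_inode *i, int bit)
{
        i->state |= 1UL << bit;
}

static void clear_state(struct toy_inode *i, int bit)
{
        i->state &= ~(1UL << bit);
}

int main(void)
{
        struct toy_inode i = { 0 };

        set_state(&i, STATE_XATTR);
        clear_state(&i, STATE_NEW);
        return test_state(&i, STATE_XATTR) ? 0 : 1;
}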
(ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -1302,6 +1300,8 @@ retry: /* Remove the chosen entry from the inode */ error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; entry = IFIRST(header); if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) @@ -1372,16 +1372,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, __func__, - "inode %lu: block %llu read error", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: block %llu read error", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, __func__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + ext4_error(inode->i_sb, "inode %lu: bad block %llu", + inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } ext4_xattr_release_block(handle, inode, bh); @@ -1506,7 +1504,7 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 14da530b05c..0ce143bd7d5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -558,7 +558,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = sbi->free_clusters; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - buf->f_namelen = sbi->options.isvfat ? 260 : 12; + buf->f_namelen = sbi->options.isvfat ? 
FAT_LFN_LEN : 12; return 0; } @@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, return i_pos; } -static int fat_write_inode(struct inode *inode, int wait) +static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -634,9 +634,14 @@ retry: return err; } +static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int fat_sync_inode(struct inode *inode) { - return fat_write_inode(inode, 1); + return __fat_write_inode(inode, 1); } EXPORT_SYMBOL_GPL(fat_sync_inode); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index f565f24019b..c1ef5015486 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -502,14 +502,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); if (*outlen < 0) return *outlen; - else if (*outlen > 255) + else if (*outlen > FAT_LFN_LEN) return -ENAMETOOLONG; op = &outname[*outlen * sizeof(wchar_t)]; } else { if (nls) { for (i = 0, ip = name, op = outname, *outlen = 0; - i < len && *outlen <= 255; + i < len && *outlen <= FAT_LFN_LEN; *outlen += 1) { if (escape && (*ip == ':')) { @@ -549,7 +549,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, return -ENAMETOOLONG; } else { for (i = 0, ip = name, op = outname, *outlen = 0; - i < len && *outlen <= 255; + i < len && *outlen <= FAT_LFN_LEN; i++, *outlen += 1) { *op++ = *ip++; @@ -701,6 +701,15 @@ static int vfat_find(struct inode *dir, struct qstr *qname, return fat_search_long(dir, qname->name, len, sinfo); } +/* + * (nfsd's) anonymous disconnected dentry? + * NOTE: !IS_ROOT() is not anonymous (I.e. d_splice_alias() did the job). + */ +static int vfat_d_anon_disconn(struct dentry *dentry) +{ + return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); +} + static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -729,11 +738,11 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, } alias = d_find_alias(inode); - if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { + if (alias && !vfat_d_anon_disconn(alias)) { /* - * This inode has non DCACHE_DISCONNECTED dentry. This - * means, the user did ->lookup() by an another name - * (longname vs 8.3 alias of it) in past. + * This inode has non anonymous-DCACHE_DISCONNECTED + * dentry. This means, the user did ->lookup() by an + * another name (longname vs 8.3 alias of it) in past. * * Switch to new one for reason of locality if possible. */ @@ -743,7 +752,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, iput(inode); unlock_super(sb); return alias; - } + } else + dput(alias); + out: unlock_super(sb); dentry->d_op = sb->s_root->d_op; diff --git a/fs/fcntl.c b/fs/fcntl.c index 97e01dc0d95..452d02f9075 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, switch (cmd) { case F_DUPFD: case F_DUPFD_CLOEXEC: - if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (arg >= rlimit(RLIMIT_NOFILE)) break; err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); if (err >= 0) { diff --git a/fs/file.c b/fs/file.c index 38039af6766..34bb7f71d99 100644 --- a/fs/file.c +++ b/fs/file.c @@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr) * N.B. 
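/*
 * ->write_inode() now receives a struct writeback_control instead of
 * an int, and fat (like hfs, hfsplus, gfs2 and jfs further down)
 * keeps its int-taking core and adds a thin adapter. A hedged sketch
 * of the pattern, with the types cut down to the minimum needed to
 * compile:
 */
enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

struct wb_control { enum sync_mode sync_mode; };

static int write_inode_core(int wait)
{
        (void)wait;            /* ... flush, blocking only if wait ... */
        return 0;
}

/* new-style entry point: derive "wait" from the control structure */
static int write_inode(const struct wb_control *wbc)
{
        return write_inode_core(wbc->sync_mode == WB_SYNC_ALL);
}

/* internal callers that must always wait keep doing so explicitly */
static int sync_inode(void)
{
        return write_inode_core(1);
}

int main(void)
{
        struct wb_control wbc = { WB_SYNC_ALL };

        return write_inode(&wbc) | sync_inode();
}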
For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ - if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (nr >= rlimit(RLIMIT_NOFILE)) return -EMFILE; /* Do we need to expand? */ diff --git a/fs/file_table.c b/fs/file_table.c index b98404b5438..32d12b78bac 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -393,7 +393,9 @@ retry: continue; if (!(f->f_mode & FMODE_WRITE)) continue; + spin_lock(&f->f_lock); f->f_mode &= ~FMODE_WRITE; + spin_unlock(&f->f_lock); if (file_check_writeable(f) != 0) continue; file_release_write(f); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1a7c42c64ff..76fc4d594ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -static int write_inode(struct inode *inode, int sync) +static int write_inode(struct inode *inode, struct writeback_control *wbc) { if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); + return inode->i_sb->s_op->write_inode(inode, wbc); return 0; } @@ -421,7 +421,6 @@ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; - int wait = wbc->sync_mode == WB_SYNC_ALL; unsigned dirty; int ret; @@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * We'll have another go at writing back this inode when we * completed a full scan of b_io. */ - if (!wait) { + if (wbc->sync_mode != WB_SYNC_ALL) { requeue_io(inode); return 0; } @@ -461,15 +460,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = do_writepages(mapping, wbc); - /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - int err = write_inode(inode, wait); + /* + * Make sure to wait on the data before writing out the metadata. + * This is important for filesystems that modify metadata on data + * I/O completion. + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; } - if (wait) { - int err = filemap_fdatawait(mapping); + /* Don't write the inode if only I_DIRTY_PAGES was set */ + if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + int err = write_inode(inode, wbc); if (ret == 0) ret = err; } diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 864dac20a24..cc94bb9563f 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -1,7 +1,6 @@ config FSCACHE tristate "General filesystem local caching manager" - depends on EXPERIMENTAL select SLOW_WORK help This option enables a generic filesystem caching manager that can be diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1a822ce2b24..ec14d19ce50 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -850,7 +850,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) req->in.args[0].size = sizeof(*arg); req->in.args[0].value = arg; req->out.numargs = 1; - /* Variable length arguement used for backward compatibility + /* Variable length argument used for backward compatibility with interface version < 7.5. 
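/*
 * The writeback_single_inode() reorder above waits for data I/O
 * before writing the inode when syncing, because some filesystems
 * (ext4's unwritten-extent conversion, for instance) update metadata
 * from data-I/O completion. A hedged sketch of the new ordering with
 * the I/O calls stubbed out (all names illustrative):
 */
static void start_data_writeback(void) { }
static int wait_for_data(void) { return 0; }
static int write_metadata(void) { return 0; }

static int writeback_one(int sync_all, int meta_dirty)
{
        int ret = 0, err;

        start_data_writeback();         /* queue the dirty pages */

        if (sync_all) {
                err = wait_for_data();  /* completion may redirty metadata */
                if (!ret)
                        ret = err;
        }
        if (meta_dirty) {
                err = write_metadata(); /* metadata is stable only now */
                if (!ret)
                        ret = err;
        }
        return ret;
}

int main(void)
{
        return writeback_one(1, 1);
}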
Rest of init_out is zeroed by do_get_request(), so a short reply is not a problem */ req->out.argvar = 1; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 4dcddf83326..a47b4310711 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -8,7 +8,6 @@ config GFS2_FS select FS_POSIX_ACL select CRC32 select SLOW_WORK - select QUOTA select QUOTACTL help A cluster filesystem. diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a6abbae8a27..e6dd2aec6f8 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -640,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) + if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) return -ENOLCK; if (cmd == F_CANCELLK) { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b8025e51cab..3aac46f6853 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -616,7 +616,7 @@ struct gfs2_sbd { unsigned int sd_log_blks_reserved; unsigned int sd_log_commited_buf; unsigned int sd_log_commited_databuf; - unsigned int sd_log_commited_revoke; + int sd_log_commited_revoke; unsigned int sd_log_num_buf; unsigned int sd_log_num_revoke; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 4511b08fc45..e5bf4b59d46 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) databufhdrs_needed = (sdp->sd_log_commited_databuf + (dbuf_limit - 1)) / dbuf_limit; - if (sdp->sd_log_commited_revoke) + if (sdp->sd_log_commited_revoke > 0) revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, sizeof(u64)); @@ -790,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || (((int)sdp->sd_log_commited_databuf) >= 0)); sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; - gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); reserved = calc_reserved(sdp); gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a054b526dc0..c1309ed1c49 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1001,7 +1001,7 @@ static const struct lm_lockops nolock_ops = { /** * gfs2_lm_mount - mount a locking protocol * @sdp: the filesystem - * @args: mount arguements + * @args: mount arguments * @silent: if 1, don't complain if the FS isn't a GFS2 fs * * Returns: errno diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e3bf6eab875..6dbcbad6ab1 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1083,7 +1083,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } -int gfs2_quota_sync(struct super_block *sb, int type) +int gfs2_quota_sync(struct super_block *sb, int type, int wait) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; @@ -1127,6 +1127,11 @@ int gfs2_quota_sync(struct super_block *sb, int type) return error; } +static int gfs2_quota_sync_timeo(struct super_block *sb, int type) +{ + return gfs2_quota_sync(sb, type, 0); +} + int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) { struct gfs2_quota_data *qd; @@ -1382,7 +1387,7 @@ int gfs2_quotad(void *data) &tune->gt_statfs_quantum); /* Update quota file */ - quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, &quotad_timeo, &tune->gt_quota_quantum); /* Check for & recover partially 
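/*
 * gfs2_quota_sync() grows a wait argument above, while quotad's
 * periodic machinery keeps its old callback signature by calling
 * through gfs2_quota_sync_timeo(). A hedged sketch of adapting a
 * function signature to a fixed callback type (names illustrative):
 */
static int quota_sync(int type, int wait)
{
        (void)type;
        (void)wait;            /* ... write dirty quota, maybe wait ... */
        return 0;
}

/* matches the int (*)(int) shape the periodic timer expects */
static int quota_sync_timeo(int type)
{
        return quota_sync(type, 0);    /* background sync never waits */
}

static int run_periodic(int (*fn)(int), int type)
{
        return fn(type);
}

int main(void)
{
        return run_periodic(quota_sync_timeo, 0);
}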
truncated inodes */ diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index e271fa07ad0..195f60c8bd1 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -25,7 +25,7 @@ extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 uid, u32 gid); -extern int gfs2_quota_sync(struct super_block *sb, int type); +extern int gfs2_quota_sync(struct super_block *sb, int type, int wait); extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); extern int gfs2_quota_init(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e5e22629da6..50aac606b99 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -22,6 +22,7 @@ #include <linux/crc32.h> #include <linux/time.h> #include <linux/wait.h> +#include <linux/writeback.h> #include "gfs2.h" #include "incore.h" @@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) * Returns: errno */ -static int gfs2_write_inode(struct inode *inode, int sync) +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync) do_unlock: gfs2_glock_dq_uninit(&gh); do_flush: - if (sync != 0) + if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); return ret; } @@ -763,7 +764,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) int error; flush_workqueue(gfs2_delete_workqueue); - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); gfs2_statfs_sync(sdp->sd_vfs, 0); error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index a0db1c94317..419042f7f0b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -49,7 +49,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, return a->store ? 
a->store(sdp, buf, len) : len; } -static struct sysfs_ops gfs2_attr_ops = { +static const struct sysfs_ops gfs2_attr_ops = { .show = gfs2_attr_show, .store = gfs2_attr_store, }; @@ -167,7 +167,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_quota_sync(sdp->sd_vfs, 0, 1); return len; } @@ -574,7 +574,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, return 0; } -static struct kset_uevent_ops gfs2_uevent_ops = { +static const struct kset_uevent_ops gfs2_uevent_ops = { .uevent = gfs2_uevent, }; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 052387e1167..fe35e3b626c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops; extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); -extern int hfs_write_inode(struct inode *, int); +extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a1cbff2b4d9..14f5cb1b9fd 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, HFS_SB(inode->i_sb)->alloc_blksz); } -int hfs_write_inode(struct inode *inode, int unused) +int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct inode *main_inode = inode; struct hfs_find_data fd; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 43022f3d514..74b473a8ef9 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -87,7 +87,8 @@ bad_inode: return ERR_PTR(err); } -static int hfsplus_write_inode(struct inode *inode, int unused) +static int hfsplus_write_inode(struct inode *inode, + struct writeback_control *wbc) { struct hfsplus_vh *vhdr; int ret = 0; diff --git a/fs/inode.c b/fs/inode.c index 03dfeb2e392..407bf392e20 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/dcache.h> #include <linux/init.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/writeback.h> #include <linux/module.h> @@ -314,7 +313,6 @@ void clear_inode(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); inode_sync_wait(inode); - vfs_dq_drop(inode); if (inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); if (S_ISBLK(inode->i_mode) && inode->i_bdev) @@ -1211,8 +1209,6 @@ void generic_delete_inode(struct inode *inode) if (op->delete_inode) { void (*delete)(struct inode *) = op->delete_inode; - if (!is_bad_inode(inode)) - vfs_dq_init(inode); /* Filesystems implementing their own * s_op->delete_inode are required to call * truncate_inode_pages and clear_inode() diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 4bd882548c4..2c90e3ef625 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -862,12 +862,12 @@ restart_loop: /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. 
That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another * use in a different page. */ - if (buffer_freed(bh)) { + if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 006f9ad838a..5ae71e75a49 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1398,7 +1398,7 @@ int journal_stop(handle_t *handle) * the case where our storage is so fast that it is more optimal to go * ahead and force a flush and wait for the transaction to be committed * than it is to wait for an arbitrary amount of time for new writers to - * join the transaction. We acheive this by measuring how long it takes + * join the transaction. We achieve this by measuring how long it takes * to commit a transaction, and compare it with how long this * transaction has been running, and if run time < commit time then we * sleep for the delta and commit. This greatly helps super fast disks @@ -1864,6 +1864,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1929,16 +1944,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer; } /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. 
+ */ set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -2120,7 +2134,7 @@ void journal_file_buffer(struct journal_head *jh, */ void __journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -2142,8 +2156,13 @@ void __journal_refile_buffer(struct journal_head *jh) __journal_temp_unlink_buffer(jh); jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 88684937095..30beb11ef92 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, "Cleaning journal tail from %d to %d (offset %lu), " "freeing %lu\n", diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1bc74b6f26d..671da7fb7ff 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -883,8 +883,7 @@ restart_loop: spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); jbd_lock_bh_state(bh); - J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || - jh->b_transaction == journal->j_running_transaction); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); /* * If there is undo-protected committed data against @@ -930,12 +929,12 @@ restart_loop: /* A buffer which has been freed while still being * journaled by a previous transaction may end up still * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another + * that buffer in the future after the "add to orphan" + * operation been committed, That's not only a performance + * gain, it also stops aliasing problems if the buffer is + * left behind for writeback and gets reallocated for another * use in a different page. 
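/*
 * With the refile change above, a buffer that was freed on disk goes
 * to the Forget list instead of Metadata/Reserved, so commit can
 * clear its dirty bits once the transaction that did the freeing is
 * safely on disk. A hedged sketch of the three-way list choice:
 */
enum jlist { LIST_FORGET, LIST_METADATA, LIST_RESERVED };

static enum jlist pick_list(int freed, int modified)
{
        if (freed)
                return LIST_FORGET;    /* block gone: never write back */
        if (modified)
                return LIST_METADATA;  /* dirtied in this transaction */
        return LIST_RESERVED;          /* reserved but untouched */
}

int main(void)
{
        return pick_list(1, 1) == LIST_FORGET ? 0 : 1;
}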
*/ - if (buffer_freed(bh)) { + if (buffer_freed(bh) && !jh->b_next_transaction) { clear_buffer_freed(bh); clear_buffer_jbddirty(bh); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ac0d027595d..c03d4dce4d7 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -39,6 +39,8 @@ #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/hash.h> +#include <linux/log2.h> +#include <linux/vmalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static int jbd2_journal_create_slab(size_t slab_size); /* * Helper function used to manage commit timeouts @@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal) } } + /* + * Create a slab for this blocksize + */ + err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); + if (err) + return err; + /* Let the recovery code check whether it needs to recover any * data from the journal. */ if (jbd2_journal_recover(journal)) @@ -1807,6 +1817,127 @@ size_t journal_tag_bytes(journal_t *journal) } /* + * JBD memory management + * + * These functions are used to allocate block-sized chunks of memory + * used for making copies of buffer_head data. Very often it will be + * page-sized chunks of data, but sometimes it will be in + * sub-page-size chunks. (For example, 16k pages on Power systems + * with a 4k block file system.) For blocks smaller than a page, we + * use a SLAB allocator. There are slab caches for each block size, + * which are allocated at mount time, if necessary, and we only free + * (all of) the slab caches when/if the jbd2 module is unloaded. For + * this reason we don't need to a mutex to protect access to + * jbd2_slab[] allocating or releasing memory; only in + * jbd2_journal_create_slab(). 
+ */ +#define JBD2_MAX_SLABS 8 +static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; +static DECLARE_MUTEX(jbd2_slab_create_sem); + +static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { + "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", + "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" +}; + + +static void jbd2_journal_destroy_slabs(void) +{ + int i; + + for (i = 0; i < JBD2_MAX_SLABS; i++) { + if (jbd2_slab[i]) + kmem_cache_destroy(jbd2_slab[i]); + jbd2_slab[i] = NULL; + } +} + +static int jbd2_journal_create_slab(size_t size) +{ + int i = order_base_2(size) - 10; + size_t slab_size; + + if (size == PAGE_SIZE) + return 0; + + if (i >= JBD2_MAX_SLABS) + return -EINVAL; + + if (unlikely(i < 0)) + i = 0; + down(&jbd2_slab_create_sem); + if (jbd2_slab[i]) { + up(&jbd2_slab_create_sem); + return 0; /* Already created */ + } + + slab_size = 1 << (i+10); + jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, + slab_size, 0, NULL); + up(&jbd2_slab_create_sem); + if (!jbd2_slab[i]) { + printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +static struct kmem_cache *get_slab(size_t size) +{ + int i = order_base_2(size) - 10; + + BUG_ON(i >= JBD2_MAX_SLABS); + if (unlikely(i < 0)) + i = 0; + BUG_ON(jbd2_slab[i] == 0); + return jbd2_slab[i]; +} + +void *jbd2_alloc(size_t size, gfp_t flags) +{ + void *ptr; + + BUG_ON(size & (size-1)); /* Must be a power of 2 */ + + flags |= __GFP_REPEAT; + if (size == PAGE_SIZE) + ptr = (void *)__get_free_pages(flags, 0); + else if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + ptr = (void *)__get_free_pages(flags, order); + else + ptr = vmalloc(size); + } else + ptr = kmem_cache_alloc(get_slab(size), flags); + + /* Check alignment; SLUB has gotten this wrong in the past, + * and this can lead to user data corruption! */ + BUG_ON(((unsigned long) ptr) & (size-1)); + + return ptr; +} + +void jbd2_free(void *ptr, size_t size) +{ + if (size == PAGE_SIZE) { + free_pages((unsigned long)ptr, 0); + return; + } + if (size > PAGE_SIZE) { + int order = get_order(size); + + if (order < 3) + free_pages((unsigned long)ptr, order); + else + vfree(ptr); + return; + } + kmem_cache_free(get_slab(size), ptr); +}; + +/* * Journal_head storage management */ static struct kmem_cache *jbd2_journal_head_cache; @@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void) jbd2_journal_destroy_revoke_caches(); jbd2_journal_destroy_jbd2_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_slabs(); } static int __init journal_init(void) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a0512700542..bfc70f57900 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * buffer can be reallocated and used by a different page. 
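/*
 * The slab array above is indexed by block size: jbd2_1k at index 0,
 * jbd2_2k at 1, and so on, via order_base_2(size) - 10. A hedged
 * standalone sketch of the index math, with order_base_2()
 * reimplemented locally for illustration:
 */
#include <stdio.h>

static int order_base_2(unsigned long n)
{
        int order = 0;

        while ((1UL << order) < n)
                order++;
        return order;                  /* ceil(log2(n)) */
}

int main(void)
{
        unsigned long sizes[] = { 1024, 2048, 4096, 16384 };
        int i;

        for (i = 0; i < 4; i++)
                printf("%5lu -> slab index %d\n",
                       sizes[i], order_base_2(sizes[i]) - 10);
        /* prints 0, 1, 2 and 4: jbd2_1k, jbd2_2k, jbd2_4k, jbd2_16k */
        return 0;
}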
+ * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ + * The buffer is committing, we simply cannot touch + * it. So we just set j_next_transaction to the + * running transaction (if there is one) and mark + * buffer as freed so that commit code knows it should + * clear dirty bits when it is done with the buffer. + */ set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh, */ void __jbd2_journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __jbd2_journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? 
BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index e22de8397b7..d32ee9412cb 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) else BUG(); } } - list->rb_node = NULL; + *list = RB_ROOT; } static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d66477c3430..213169780b6 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -20,7 +20,6 @@ #include <linux/sched.h> #include <linux/fs.h> -#include <linux/quotaops.h> #include <linux/posix_acl_xattr.h> #include "jfs_incore.h" #include "jfs_txnmgr.h" @@ -174,7 +173,7 @@ cleanup: return rc; } -static int jfs_acl_chmod(struct inode *inode) +int jfs_acl_chmod(struct inode *inode) { struct posix_acl *acl, *clone; int rc; @@ -205,26 +204,3 @@ static int jfs_acl_chmod(struct inode *inode) posix_acl_release(clone); return rc; } - -int jfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - int rc; - - rc = inode_change_ok(inode, iattr); - if (rc) - return rc; - - if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || - (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { - if (vfs_dq_transfer(inode, iattr)) - return -EDQUOT; - } - - rc = inode_setattr(inode, iattr); - - if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jfs_acl_chmod(inode); - - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2b70fa78e4a..14ba982b3f2 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -18,6 +18,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_dmap.h" @@ -47,7 +48,7 @@ static int jfs_open(struct inode *inode, struct file *file) { int rc; - if ((rc = generic_file_open(inode, file))) + if ((rc = dquot_file_open(inode, file))) return rc; /* @@ -88,14 +89,40 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int rc; + + rc = inode_change_ok(inode, iattr); + if (rc) + return rc; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; + } + + rc = inode_setattr(inode, iattr); + + if (!rc && (iattr->ia_valid & ATTR_MODE)) + rc = jfs_acl_chmod(inode); + + return rc; +} + const struct inode_operations jfs_file_inode_operations = { .truncate = jfs_truncate, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b2ae190a77b..9dd126276c9 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,6 +22,7 @@ #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/quotaops.h> +#include <linux/writeback.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -120,8 +121,10 @@ int jfs_commit_inode(struct 
inode *inode, int wait) return rc; } -int jfs_write_inode(struct inode *inode, int wait) +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) { + int wait = wbc->sync_mode == WB_SYNC_ALL; + if (test_cflag(COMMIT_Nolink, inode)) return 0; /* @@ -146,6 +149,9 @@ void jfs_delete_inode(struct inode *inode) { jfs_info("In jfs_delete_inode, inode = 0x%p", inode); + if (!is_bad_inode(inode)) + dquot_initialize(inode); + if (!is_bad_inode(inode) && (JFS_IP(inode)->fileset == FILESYSTEM_I)) { truncate_inode_pages(&inode->i_data, 0); @@ -158,9 +164,9 @@ void jfs_delete_inode(struct inode *inode) /* * Free the inode from the quota allocation. */ - vfs_dq_init(inode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_initialize(inode); + dquot_free_inode(inode); + dquot_drop(inode); } clear_inode(inode); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index b07bd417ef8..54e07559878 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -22,7 +22,7 @@ int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_setattr(struct dentry *, struct iattr *); +int jfs_acl_chmod(struct inode *inode); #else @@ -32,5 +32,10 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } +static inline int jfs_acl_chmod(struct inode *inode) +{ + return 0; +} + #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 925871e9887..0e4623be70c 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) * It's time to move the inline table to an external * page and begin to build the xtree */ - if (vfs_dq_alloc_block(ip, sbi->nbperpage)) + if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } @@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); - vfs_dq_free_block(ip, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; @@ -1027,10 +1027,9 @@ static int dtSplitUp(tid_t tid, n = xlen; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, n)) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, n); + if (rc) goto extendOut; - } quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, @@ -1308,7 +1307,7 @@ static int dtSplitUp(tid_t tid, /* Rollback quota allocation */ if (rc && quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: @@ -1369,9 +1368,10 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, return -EIO; /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); @@ -1892,6 +1892,7 @@ static int dtSplitRoot(tid_t tid, struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; + int rc; /* get split root page */ smp = split->mp; @@ -1916,9 +1917,10 @@ static int dtSplitRoot(tid_t tid, rp = rmp->data; /* Allocate blocks to quota. 
*/ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } BT_MARK_DIRTY(rmp, ip); @@ -2287,7 +2289,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); @@ -2363,7 +2365,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip, xlen = lengthPXD(&p->header.self); /* Free quota allocation */ - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 41d6045dbeb..5d3bbd10f8d 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -141,10 +141,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) } /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } /* determine the value of the extent flag */ @@ -164,7 +165,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) */ if (rc) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); return (rc); } @@ -256,10 +257,11 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) goto exit; /* Allocat blocks to quota. */ - if (vfs_dq_alloc_block(ip, nxlen)) { + rc = dquot_alloc_block(ip, nxlen); + if (rc) { dbFree(ip, nxaddr, (s64) nxlen); mutex_unlock(&JFS_IP(ip)->commit_mutex); - return -EDQUOT; + return rc; } delta = nxlen - xlen; @@ -297,7 +299,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) /* extend the extent */ if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { dbFree(ip, xaddr + xlen, delta); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } else { @@ -308,7 +310,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) */ if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { dbFree(ip, nxaddr, nxlen); - vfs_dq_free_block(ip, nxlen); + dquot_free_block(ip, nxlen); goto exit; } } diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index dc0e02159ac..829921b6776 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -116,10 +116,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode) /* * Allocate inode to quota. 
*/ - if (vfs_dq_alloc_inode(inode)) { - rc = -EDQUOT; + dquot_initialize(inode); + rc = dquot_alloc_inode(inode); + if (rc) goto fail_drop; - } inode->i_mode = mode; /* inherit flags from parent */ @@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) return inode; fail_drop: - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; fail_unlock: inode->i_nlink = 0; diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 1eff7db34d6..79e2c79661d 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); -extern int jfs_write_inode(struct inode*, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); extern void jfs_delete_inode(struct inode *); extern void jfs_dirty_inode(struct inode *); extern void jfs_truncate(struct inode *); @@ -40,6 +40,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index d654a645864..6c50871e622 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -585,10 +585,10 @@ int xtInsert(tid_t tid, /* transaction id */ hint = addressXAD(xad) + lengthXAD(xad) - 1; } else hint = 0; - if ((rc = vfs_dq_alloc_block(ip, xlen))) + if ((rc = dquot_alloc_block(ip, xlen))) goto out; if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); goto out; } } @@ -617,7 +617,7 @@ int xtInsert(tid_t tid, /* transaction id */ /* undo data extent allocation */ if (*xaddrp == 0) { dbFree(ip, xaddr, (s64) xlen); - vfs_dq_free_block(ip, xlen); + dquot_free_block(ip, xlen); } return rc; } @@ -985,10 +985,9 @@ xtSplitPage(tid_t tid, struct inode *ip, rbn = addressPXD(pxd); /* Allocate blocks to quota. */ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { - rc = -EDQUOT; + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) goto clean_up; - } quota_allocation += lengthPXD(pxd); @@ -1195,7 +1194,7 @@ xtSplitPage(tid_t tid, struct inode *ip, /* Rollback quota allocation. */ if (quota_allocation) - vfs_dq_free_block(ip, quota_allocation); + dquot_free_block(ip, quota_allocation); return (rc); } @@ -1235,6 +1234,7 @@ xtSplitRoot(tid_t tid, struct pxdlist *pxdlist; struct tlock *tlck; struct xtlock *xtlck; + int rc; sp = &JFS_IP(ip)->i_xtroot; @@ -1252,9 +1252,10 @@ xtSplitRoot(tid_t tid, return -EIO; /* Allocate blocks to quota. 
*/ - if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) { + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { release_metapage(rmp); - return -EDQUOT; + return rc; } jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); @@ -3680,7 +3681,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) ip->i_size = newsize; /* update quota allocation to reflect freed blocks */ - vfs_dq_free_block(ip, nfreed); + dquot_free_block(ip, nfreed); /* * free tlock of invalidated pages diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index c79a4270f08..4a3e9f39c21 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -85,6 +85,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); + dquot_initialize(dip); + /* * search parent directory for entry/freespace * (dtSearch() returns parent directory page pinned) @@ -215,6 +217,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); + dquot_initialize(dip); + /* link count overflow on parent directory ? */ if (dip->i_nlink == JFS_LINK_MAX) { rc = -EMLINK; @@ -356,7 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); /* directory must be empty to be removed */ if (!dtEmpty(ip)) { @@ -483,7 +488,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); /* Init inode for quota operations. */ - vfs_dq_init(ip); + dquot_initialize(dip); + dquot_initialize(ip); if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -805,6 +811,8 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == 0) return -ENOENT; + dquot_initialize(dir); + tid = txBegin(ip->i_sb, 0); mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); @@ -896,6 +904,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + dquot_initialize(dip); + ssize = strlen(name) + 1; /* @@ -1087,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, new_dentry->d_name.name); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_ip = old_dentry->d_inode; new_ip = new_dentry->d_inode; @@ -1136,7 +1149,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. 
*/ - vfs_dq_init(new_ip); + dquot_initialize(new_ip); } /* @@ -1360,6 +1373,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, jfs_info("jfs_mknod: %s", dentry->d_name.name); + dquot_initialize(dir); + if ((rc = get_UCSname(&dname, dentry))) goto out; @@ -1541,8 +1556,8 @@ const struct inode_operations jfs_dir_inode_operations = { .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, .removexattr = jfs_removexattr, -#ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d929a822a74..266699deb1c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -131,6 +131,11 @@ static void jfs_destroy_inode(struct inode *inode) kmem_cache_free(jfs_inode_cachep, ji); } +static void jfs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); @@ -745,6 +750,7 @@ static const struct super_operations jfs_super_operations = { .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .delete_inode = jfs_delete_inode, + .clear_inode = jfs_clear_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, .freeze_fs = jfs_freeze, diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fad364548bc..1f594ab2189 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(ip, nblocks)) { - return -EDQUOT; - } + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); if (rc) { /*Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); return rc; } @@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, failed: /* Rollback quota allocation. */ - vfs_dq_free_block(ip, nblocks); + dquot_free_block(ip, nblocks); dbFree(ip, blkno, nblocks); return rc; @@ -538,7 +538,8 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (blocks_needed > current_blocks) { /* Allocate new blocks to quota. */ - if (vfs_dq_alloc_block(inode, blocks_needed)) + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) return -EDQUOT; quota_allocation = blocks_needed; @@ -602,7 +603,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) clean_up: /* Rollback quota allocation */ if (quota_allocation) - vfs_dq_free_block(inode, quota_allocation); + dquot_free_block(inode, quota_allocation); return (rc); } @@ -677,7 +678,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, /* If old blocks exist, they must be removed from quota allocation. 
*/ if (old_blocks) - vfs_dq_free_block(inode, old_blocks); + dquot_free_block(inode, old_blocks); inode->i_ctime = CURRENT_TIME; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 4600c2037b8..bb464d12104 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -479,8 +479,8 @@ again: mutex_lock(&nlm_host_mutex); } } } - mutex_unlock(&nlm_host_mutex); + nsm_release(nsm); } /* diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index f956651d0f6..fefa4df3f00 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -349,9 +349,9 @@ retry: * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle * @info: pointer to NLMPROC_SM_NOTIFY arguments * - * Returns a matching nsm_handle if found in the nsm cache; the returned - * nsm_handle's reference count is bumped and sm_monitored is cleared. - * Otherwise returns NULL if some error occurred. + * Returns a matching nsm_handle if found in the nsm cache. The returned + * nsm_handle's reference count is bumped. Otherwise returns NULL if some + * error occurred. */ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) { @@ -370,12 +370,6 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) atomic_inc(&cached->sm_count); spin_unlock(&nsm_lock); - /* - * During subsequent lock activity, force a fresh - * notification to be set up for this host. - */ - cached->sm_monitored = 0; - dprintk("lockd: host %s (%s) rebooted, cnt %d\n", cached->sm_name, cached->sm_addrbuf, atomic_read(&cached->sm_count)); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e50cfa3d965..7d150517ddf 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -243,11 +243,9 @@ static int make_socks(struct svc_serv *serv) if (err < 0) goto out_err; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) err = create_lockd_family(serv, PF_INET6); if (err < 0 && err != -EAFNOSUPPORT) goto out_err; -#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */ warned = 0; return 0; diff --git a/fs/locks.c b/fs/locks.c index ae9ded026b7..ab24d49fc04 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1455,7 +1455,7 @@ EXPORT_SYMBOL(generic_setlease); * leases held by processes on this node. * * There is also no break_lease method; filesystems that - * handle their own leases shoud break leases themselves from the + * handle their own leases should break leases themselves from the * filesystem's open, create, and (on truncate) setattr methods. * * Warning: the only current setlease methods exist only to disable diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig new file mode 100644 index 00000000000..daf9a9b32dd --- /dev/null +++ b/fs/logfs/Kconfig @@ -0,0 +1,17 @@ +config LOGFS + tristate "LogFS file system (EXPERIMENTAL)" + depends on (MTD || BLOCK) && EXPERIMENTAL + select ZLIB_INFLATE + select ZLIB_DEFLATE + select CRC32 + select BTREE + help + Flash filesystem aimed to scale efficiently to large devices. + In comparison to JFFS2 it offers significantly faster mount + times and potentially less RAM usage, although the latter has + not been measured yet. + + In its current state it is still very experimental and should + not be used for other than testing purposes. + + If unsure, say N. 
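A minimal usage sketch to accompany the Kconfig entry, assuming a device already formatted as logfs; the device node and mount point are illustrative assumptions, not taken from this series. Once CONFIG_LOGFS is built, a logfs volume mounts via mount(2) like any other filesystem:

	/* Hedged sketch: the paths below are assumptions, not from the patch. */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* "logfs" is the filesystem type name registered by this series. */
		if (mount("/dev/mtdblock0", "/mnt/flash", "logfs", 0, NULL)) {
			perror("mount logfs");
			return 1;
		}
		return 0;
	}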
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile new file mode 100644 index 00000000000..4820027787e --- /dev/null +++ b/fs/logfs/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LOGFS) += logfs.o + +logfs-y += compr.o +logfs-y += dir.o +logfs-y += file.o +logfs-y += gc.o +logfs-y += inode.o +logfs-y += journal.o +logfs-y += readwrite.o +logfs-y += segment.o +logfs-y += super.o +logfs-$(CONFIG_BLOCK) += dev_bdev.o +logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c new file mode 100644 index 00000000000..44bbfd249ab --- /dev/null +++ b/fs/logfs/compr.c @@ -0,0 +1,95 @@ +/* + * fs/logfs/compr.c - compression routines + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/vmalloc.h> +#include <linux/zlib.h> + +#define COMPR_LEVEL 3 + +static DEFINE_MUTEX(compr_mutex); +static struct z_stream_s stream; + +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_deflateInit(&stream, COMPR_LEVEL); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_deflateEnd(&stream); + if (err != Z_OK) + goto error; + + if (stream.total_out >= stream.total_in) + goto error; + + ret = stream.total_out; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) +{ + int err, ret; + + ret = -EIO; + mutex_lock(&compr_mutex); + err = zlib_inflateInit(&stream); + if (err != Z_OK) + goto error; + + stream.next_in = in; + stream.avail_in = inlen; + stream.total_in = 0; + stream.next_out = out; + stream.avail_out = outlen; + stream.total_out = 0; + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) + goto error; + + err = zlib_inflateEnd(&stream); + if (err != Z_OK) + goto error; + + ret = 0; +error: + mutex_unlock(&compr_mutex); + return ret; +} + +int __init logfs_compr_init(void) +{ + size_t size = max(zlib_deflate_workspacesize(), + zlib_inflate_workspacesize()); + stream.workspace = vmalloc(size); + if (!stream.workspace) + return -ENOMEM; + return 0; +} + +void logfs_compr_exit(void) +{ + vfree(stream.workspace); +} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c new file mode 100644 index 00000000000..9718c22f186 --- /dev/null +++ b/fs/logfs/dev_bdev.c @@ -0,0 +1,327 @@ +/* + * fs/logfs/dev_bdev.c - Device access methods for block devices + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static void request_complete(struct bio *bio, int err) +{ + complete((struct completion *)bio->bi_private); +} + +static int sync_request(struct page *page, struct block_device *bdev, int rw) +{ + struct bio bio; + struct bio_vec bio_vec; + struct completion complete; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = PAGE_SIZE; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_idx = 0; + bio.bi_size = PAGE_SIZE; + bio.bi_bdev = bdev; + bio.bi_sector = page->index * (PAGE_SIZE >> 9); + 
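/* Run the bio synchronously: request_complete() above completes the + * on-stack completion from ->bi_end_io, so the caller can block in + * wait_for_completion() until this single-page request finishes. */ +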
init_completion(&complete); + bio.bi_private = &complete; + bio.bi_end_io = request_complete; + + submit_bio(rw, &bio); + generic_unplug_device(bdev_get_queue(bdev)); + wait_for_completion(&complete); + return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; +} + +static int bdev_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + struct block_device *bdev = logfs_super(sb)->s_bdev; + int err; + + err = sync_request(page, bdev, READ); + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static DECLARE_WAIT_QUEUE_HEAD(wq); + +static void writeseg_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + struct page *page; + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + do { + page = bvec->bv_page; + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + end_page_writeback(page); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct bio *bio; + struct page *page; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + bio->bi_io_vec[i].bv_page = page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = writeseg_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. 
+ */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); + generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev)); +} + + +static void erase_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct bio *bio; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + bio->bi_io_vec[i].bv_page = super->s_erase_page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len, + int ensure_write) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(to & (PAGE_SIZE - 1)); + BUG_ON(len & (PAGE_SIZE - 1)); + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + if (ensure_write) { + /* + * Object store doesn't care whether erases happen or not. + * But for the journal they are required. Otherwise a scan + * can find an old commit entry and assume it is the current + * one, travelling back in time. 
+ */ + do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); + } + + return 0; +} + +static void bdev_sync(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + wait_event(wq, atomic_read(&super->s_pending_writes) == 0); +} + +static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + + *ofs = 0; + return read_cache_page(mapping, 0, filler, sb); +} + +static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = bdev_readpage; + u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; + pgoff_t index = pos >> PAGE_SHIFT; + + *ofs = pos; + return read_cache_page(mapping, index, filler, sb); +} + +static int bdev_write_sb(struct super_block *sb, struct page *page) +{ + struct block_device *bdev = logfs_super(sb)->s_bdev; + + /* Nothing special to do for block devices. */ + return sync_request(page, bdev, WRITE); +} + +static void bdev_put_device(struct super_block *sb) +{ + close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); +} + +static const struct logfs_device_ops bd_devops = { + .find_first_sb = bdev_find_first_sb, + .find_last_sb = bdev_find_last_sb, + .write_sb = bdev_write_sb, + .readpage = bdev_readpage, + .writeseg = bdev_writeseg, + .erase = bdev_erase, + .sync = bdev_sync, + .put_device = bdev_put_device, +}; + +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + struct block_device *bdev; + + bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { + int mtdnr = MINOR(bdev->bd_dev); + close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + } + + return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); +} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 00000000000..cafb6ef2e05 --- /dev/null +++ b/fs/logfs/dev_mtd.c @@ -0,0 +1,254 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/completion.h> +#include <linux/mount.h> +#include <linux/sched.h> + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. 
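For now a short read is + * simply reported as -EIO below rather than retried.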
*/ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an excercise in futility! + */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN) { + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct 
super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct super_block *sb) +{ + put_mtd_device(logfs_super(sb)->s_mtd); +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + struct mtd_info *mtd; + const struct logfs_device_ops *devops = &mtd_devops; + + mtd = get_mtd_device(NULL, mtdnr); + return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); +} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c new file mode 100644 index 00000000000..56a8bfbb012 --- /dev/null +++ b/fs/logfs/dir.c @@ -0,0 +1,827 @@ +/* + * fs/logfs/dir.c - directory-related code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" + + +/* + * Atomic dir operations + * + * Directory operations are by default not atomic. Dentries and Inodes are + * created/removed/altered in seperate operations. Therefore we need to do + * a small amount of journaling. + * + * Create, link, mkdir, mknod and symlink all share the same function to do + * the work: __logfs_create. This function works in two atomic steps: + * 1. allocate inode (remember in journal) + * 2. allocate dentry (clear journal) + * + * As we can only get interrupted between the two, when the inode we just + * created is simply stored in the anchor. On next mount, if we were + * interrupted, we delete the inode. From a users point of view the + * operation never happened. + * + * Unlink and rmdir also share the same function: unlink. Again, this + * function works in two atomic steps + * 1. remove dentry (remember inode in journal) + * 2. 
unlink inode (clear journal) + * + * And again, on the next mount, if we were interrupted, we delete the inode. + * From a user's point of view the operation succeeded. + * + * Rename is the real pain to deal with, harder than all the other methods + * combined. Depending on the circumstances we can run into three cases. + * A "target rename" where the target dentry already existed, a "local + * rename" where both parent directories are identical or a "cross-directory + * rename" in the remaining case. + * + * Local rename is atomic, as the old dentry is simply rewritten with a new + * name. + * + * Cross-directory rename works in two steps, similar to __logfs_create and + * logfs_unlink: + * 1. Write new dentry (remember old dentry in journal) + * 2. Remove old dentry (clear journal) + * + * Here we remember a dentry instead of an inode. On next mount, if we were + * interrupted, we delete the dentry. From a user's point of view, the + * operation succeeded. + * + * Target rename works in three atomic steps: + * 1. Attach old inode to new dentry (remember old dentry and new inode) + * 2. Remove old dentry (still remember the new inode) + * 3. Remove victim inode + * + * Here we remember both an inode and a dentry. If we get interrupted + * between steps 1 and 2, we delete both the dentry and the inode. If + * we get interrupted between steps 2 and 3, we delete just the inode. + * In either case, the remaining objects are deleted on next mount. From + * a user's point of view, the operation succeeded. + */ + +static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, + loff_t pos) +{ + return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); +} + +static int write_inode(struct inode *inode) +{ + return __logfs_write_inode(inode, WF_LOCK); +} + +static s64 dir_seek_data(struct inode *inode, s64 pos) +{ + s64 new_pos = logfs_seek_data(inode, pos); + + return max(pos, new_pos - 1); +} + +static int beyond_eof(struct inode *inode, loff_t bix) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + return pos >= i_size_read(inode); +} + +/* + * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, + * so short names (len <= 9) don't even occupy the complete 32bit name + * space. A prime >256 ensures short names quickly spread the 32bit + * name space. Add about 26 for the estimated amount of information + * of each character and pick a prime nearby, preferably a bit-sparse + * one. + */ +static u32 hash_32(const char *s, int len, u32 seed) +{ + u32 hash = seed; + int i; + + for (i = 0; i < len; i++) + hash = hash * 293 + s[i]; + return hash; +} + +/* + * We have to satisfy several conflicting requirements here. Small + * directories should stay fairly compact and not require too many + * indirect blocks. The number of possible locations for a given hash + * should be small to make lookup() fast. And we should try hard not + * to overflow the 32bit name space or nfs and 32bit host systems will + * be unhappy. + * + * So we use the following scheme. First we reduce the hash to 0..15 + * and try a direct block. If that is occupied we reduce the hash to + * 16..255 and try an indirect block. Same for 2x and 3x indirect + * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, + * but use buckets containing eight entries instead of a single one. + * + * Using 16 entries should allow for a reasonable amount of hash + * collisions, so the 32bit name space can be packed fairly tight + * before overflowing.
Oh and currently we don't overflow but return + * and error. + * + * How likely are collisions? Doing the appropriate math is beyond me + * and the Bronstein textbook. But running a test program to brute + * force collisions for a couple of days showed that on average the + * first collision occurs after 598M entries, with 290M being the + * smallest result. Obviously 21 entries could already cause a + * collision if all entries are carefully chosen. + */ +static pgoff_t hash_index(u32 hash, int round) +{ + u32 i0_blocks = I0_BLOCKS; + u32 i1_blocks = I1_BLOCKS; + u32 i2_blocks = I2_BLOCKS; + u32 i3_blocks = I3_BLOCKS; + + switch (round) { + case 0: + return hash % i0_blocks; + case 1: + return i0_blocks + hash % (i1_blocks - i0_blocks); + case 2: + return i1_blocks + hash % (i2_blocks - i1_blocks); + case 3: + return i2_blocks + hash % (i3_blocks - i2_blocks); + case 4 ... 19: + return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16)) + + round - 4; + } + BUG(); +} + +static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) +{ + struct qstr *name = &dentry->d_name; + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(name->name, name->len, 0); + pgoff_t index; + int round; + + if (name->len > LOGFS_MAX_NAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (beyond_eof(dir, index)) + return NULL; + if (!logfs_exist_block(dir, index)) + continue; + page = read_cache_page(dir->i_mapping, index, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return page; + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + if (name->len != be16_to_cpu(dd->namelen) || + memcmp(name->name, dd->name, name->len)) { + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + continue; + } + + kunmap_atomic(dd, KM_USER0); + return page; + } + return NULL; +} + +static int logfs_remove_inode(struct inode *inode) +{ + int ret; + + inode->i_nlink--; + ret = write_inode(inode); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + if (logfs_inode(inode)->li_block) + logfs_inode(inode)->li_block->ta = NULL; + kfree(ta); +} + +static int logfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct inode *inode = dentry->d_inode; + struct logfs_transaction *ta; + struct page *page; + pgoff_t index; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = UNLINK_1; + ta->ino = inode->i_ino; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + page = logfs_get_dd_page(dir, dentry); + if (!page) { + kfree(ta); + return -ENOENT; + } + if (IS_ERR(page)) { + kfree(ta); + return PTR_ERR(page); + } + index = page->index; + page_cache_release(page); + + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(dir, ta); + + ret = logfs_delete(dir, index, NULL); + if (!ret) + ret = write_inode(dir); + + if (ret) { + abort_transaction(dir, ta); + printk(KERN_ERR"LOGFS: unable to delete inode\n"); + goto out; + } + + ta->state = UNLINK_2; + logfs_add_transaction(inode, ta); + ret = logfs_remove_inode(inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static inline int logfs_empty_dir(struct inode *dir) +{ + u64 data; + + data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; + return data >= i_size_read(dir); +} + +static int logfs_rmdir(struct inode *dir, 
struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + if (!logfs_empty_dir(inode)) + return -ENOTEMPTY; + + return logfs_unlink(dir, dentry); +} + +/* FIXME: readdir currently has its own dir_walk code. I don't see a good + * way to combine the two copies */ +#define IMPLICIT_NODES 2 +static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *dir = file->f_dentry->d_inode; + loff_t pos = file->f_pos - IMPLICIT_NODES; + struct page *page; + struct logfs_disk_dentry *dd; + int full; + + BUG_ON(pos < 0); + for (;; pos++) { + if (beyond_eof(dir, pos)) + break; + if (!logfs_exist_block(dir, pos)) { + /* deleted dentry */ + pos = dir_seek_data(dir, pos); + continue; + } + page = read_cache_page(dir->i_mapping, pos, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + dd = kmap_atomic(page, KM_USER0); + BUG_ON(dd->namelen == 0); + + full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), + pos, be64_to_cpu(dd->ino), dd->type); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + if (full) + break; + } + + file->f_pos = pos + IMPLICIT_NODES; + return 0; +} + +static int logfs_readdir(struct file *file, void *buf, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + ino_t pino = parent_ino(file->f_dentry); + int err; + + if (file->f_pos < 0) + return -EINVAL; + + if (file->f_pos == 0) { + if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + if (file->f_pos == 1) { + if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0) + return 0; + file->f_pos++; + } + + err = __logfs_readdir(file, buf, filldir); + return err; +} + +static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) +{ + dd->namelen = cpu_to_be16(name->len); + memcpy(dd->name, name->name, name->len); +} + +static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct page *page; + struct logfs_disk_dentry *dd; + pgoff_t index; + u64 ino = 0; + struct inode *inode; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return ERR_CAST(page); + if (!page) { + d_add(dentry, NULL); + return NULL; + } + index = page->index; + dd = kmap_atomic(page, KM_USER0); + ino = be64_to_cpu(dd->ino); + kunmap_atomic(dd, KM_USER0); + page_cache_release(page); + + inode = logfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) { + printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)\n", + ino, dir->i_ino, index); + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +static void grow_dir(struct inode *dir, loff_t index) +{ + index = (index + 1) << dir->i_sb->s_blocksize_bits; + if (i_size_read(dir) < index) + i_size_write(dir, index); +} + +static int logfs_write_dir(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct page *page; + struct logfs_disk_dentry *dd; + u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + pgoff_t index; + int round, err; + + for (round = 0; round < 20; round++) { + index = hash_index(hash, round); + + if (logfs_exist_block(dir, index)) + continue; + page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); + if (!page) + return -ENOMEM; + + dd = kmap_atomic(page, KM_USER0); + memset(dd, 0, sizeof(*dd)); + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + logfs_set_name(dd, &dentry->d_name); + kunmap_atomic(dd, KM_USER0); + + err = logfs_write_buf(dir, page, WF_LOCK); + unlock_page(page); + page_cache_release(page);
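+ /* The new dentry may sit beyond the old i_size, so on success the + * directory is grown to cover it (see grow_dir() above). */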
+ if (!err) + grow_dir(dir, index); + return err; + } + /* FIXME: Is there a better return value? In most cases neither + * the filesystem nor the directory are full. But we have had + * too many collisions for this particular hash and no fallback. + */ + return -ENOSPC; +} + +static int __logfs_create(struct inode *dir, struct dentry *dentry, + struct inode *inode, const char *dest, long destlen) +{ + struct logfs_super *super = logfs_super(dir->i_sb); + struct logfs_inode *li = logfs_inode(inode); + struct logfs_transaction *ta; + int ret; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CREATE_1; + ta->ino = inode->i_ino; + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(inode, ta); + + if (dest) { + /* symlink */ + ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); + if (!ret) + ret = write_inode(inode); + } else { + /* creat/mkdir/mknod */ + ret = write_inode(inode); + } + if (ret) { + abort_transaction(inode, ta); + li->li_flags |= LOGFS_IF_STILLBORN; + /* FIXME: truncate symlink */ + inode->i_nlink--; + iput(inode); + goto out; + } + + ta->state = CREATE_2; + logfs_add_transaction(dir, ta); + ret = logfs_write_dir(dir, dentry, inode); + /* sync directory */ + if (!ret) + ret = write_inode(dir); + + if (ret) { + logfs_del_transaction(dir, ta); + ta->state = CREATE_2; + logfs_add_transaction(inode, ta); + logfs_remove_inode(inode); + iput(inode); + goto out; + } + d_instantiate(dentry, inode); +out: + mutex_unlock(&super->s_dirop_mutex); + return ret; +} + +static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + + /* + * FIXME: why do we have to fill in S_IFDIR, while the mode is + * correct for mknod, creat, etc.? Smells like the vfs *should* + * do it for us but for some reason fails to do so. 
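Possibly because + * mkdir(2) only passes permission bits in its mode argument, while + * mknod(2) supplies the full mode including the S_IFMT file type.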
+ */ + inode = logfs_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + struct inode *inode; + + if (dentry->d_name.len > LOGFS_MAX_NAMELEN) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + init_special_inode(inode, mode, rdev); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + struct inode *inode; + size_t destlen = strlen(target) + 1; + + if (destlen > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + inode = logfs_new_inode(dir, S_IFLNK | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + + return __logfs_create(dir, dentry, inode, target, destlen); +} + +static int logfs_permission(struct inode *inode, int mask) +{ + return generic_permission(inode, mask, NULL); +} + +static int logfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + + if (inode->i_nlink >= LOGFS_LINK_MAX) + return -EMLINK; + + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + atomic_inc(&inode->i_count); + inode->i_nlink++; + mark_inode_dirty_sync(inode); + + return __logfs_create(dir, dentry, inode, NULL, 0); +} + +static int logfs_get_dd(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, loff_t *pos) +{ + struct page *page; + void *map; + + page = logfs_get_dd_page(dir, dentry); + if (IS_ERR(page)) + return PTR_ERR(page); + *pos = page->index; + map = kmap_atomic(page, KM_USER0); + memcpy(dd, map, sizeof(*dd)); + kunmap_atomic(map, KM_USER0); + page_cache_release(page); + return 0; +} + +static int logfs_delete_dd(struct inode *dir, loff_t pos) +{ + /* + * Getting called with pos somewhere beyond eof is either a goofup + * within this file or means someone maliciously edited the + * (crc-protected) journal. + */ + BUG_ON(beyond_eof(dir, pos)); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); + return logfs_delete(dir, pos, NULL); +} + +/* + * Cross-directory rename, target does not exist. Just a little nasty. + * Create a new dentry in the target dir, then remove the old dentry, + * all the while taking care to remember our operation in the journal. + */ +static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + /* 1. 
locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = CROSS_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + + /* 2. write target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + if (!err) + err = write_inode(new_dir); + + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = CROSS_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, + struct logfs_disk_dentry *dd, struct inode *inode) +{ + loff_t pos; + int err; + + err = logfs_get_dd(dir, dentry, dd, &pos); + if (err) + return err; + dd->ino = cpu_to_be64(inode->i_ino); + dd->type = logfs_type(inode); + + err = write_dir(dir, dd, pos); + if (err) + return err; + log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, + dd->name, be64_to_cpu(dd->ino)); + return write_inode(dir); +} + +/* Target dentry exists - the worst case. We need to attach the source + * inode to the target dentry, then remove the orphaned target inode and + * source dentry. + */ +static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct logfs_super *super = logfs_super(old_dir->i_sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + int isdir = S_ISDIR(old_inode->i_mode); + struct logfs_disk_dentry dd; + struct logfs_transaction *ta; + loff_t pos; + int err; + + BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); + if (isdir) { + if (!logfs_empty_dir(new_inode)) + return -ENOTEMPTY; + } + + /* 1. locate source dd */ + err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); + if (err) + return err; + + ta = kzalloc(sizeof(*ta), GFP_KERNEL); + if (!ta) + return -ENOMEM; + + ta->state = TARGET_RENAME_1; + ta->dir = old_dir->i_ino; + ta->pos = pos; + ta->ino = new_inode->i_ino; + + /* 2. attach source inode to target dd */ + mutex_lock(&super->s_dirop_mutex); + logfs_add_transaction(new_dir, ta); + err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); + if (err) { + super->s_rename_dir = 0; + super->s_rename_pos = 0; + super->s_victim_ino = 0; + abort_transaction(new_dir, ta); + goto out; + } + + /* 3. remove source dd */ + ta->state = TARGET_RENAME_2; + logfs_add_transaction(old_dir, ta); + err = logfs_delete_dd(old_dir, pos); + if (!err) + err = write_inode(old_dir); + LOGFS_BUG_ON(err, old_dir->i_sb); + + /* 4. remove target inode */ + ta->state = TARGET_RENAME_3; + logfs_add_transaction(new_inode, ta); + err = logfs_remove_inode(new_inode); + +out: + mutex_unlock(&super->s_dirop_mutex); + return err; +} + +static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (new_dentry->d_inode) + return logfs_rename_target(old_dir, old_dentry, + new_dir, new_dentry); + return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); +} + +/* No locking done here, as this is called before .get_sb() returns. 
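The filesystem + * is not yet reachable from userspace at that point, so replay cannot + * race with ordinary directory operations.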
*/ +int logfs_replay_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + u64 ino, pos; + int err; + + if (super->s_victim_ino) { + /* delete victim inode */ + ino = super->s_victim_ino; + printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + LOGFS_BUG_ON(i_size_read(inode) > 0, sb); + super->s_victim_ino = 0; + err = logfs_remove_inode(inode); + iput(inode); + if (err) { + super->s_victim_ino = ino; + goto fail; + } + } + if (super->s_rename_dir) { + /* delete old dd from rename */ + ino = super->s_rename_dir; + pos = super->s_rename_pos; + printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", + ino, pos); + inode = logfs_iget(sb, ino); + if (IS_ERR(inode)) + goto fail; + + super->s_rename_dir = 0; + super->s_rename_pos = 0; + err = logfs_delete_dd(inode, pos); + iput(inode); + if (err) { + super->s_rename_dir = ino; + super->s_rename_pos = pos; + goto fail; + } + } + return 0; +fail: + LOGFS_BUG(sb); + return -EIO; +} + +const struct inode_operations logfs_symlink_iops = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, +}; + +const struct inode_operations logfs_dir_iops = { + .create = logfs_create, + .link = logfs_link, + .lookup = logfs_lookup, + .mkdir = logfs_mkdir, + .mknod = logfs_mknod, + .rename = logfs_rename, + .rmdir = logfs_rmdir, + .permission = logfs_permission, + .symlink = logfs_symlink, + .unlink = logfs_unlink, +}; +const struct file_operations logfs_dir_fops = { + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .readdir = logfs_readdir, + .read = generic_read_dir, +}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c new file mode 100644 index 00000000000..370f367a933 --- /dev/null +++ b/fs/logfs/file.c @@ -0,0 +1,263 @@ +/* + * fs/logfs/file.c - prepare_write, commit_write and friends + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/sched.h> +#include <linux/writeback.h> + +static int logfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + *pagep = page; + + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) + return 0; + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + len; + + /* Reading beyond i_size is simple: memset to zero */ + zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); + return 0; + } + return logfs_readpage_nolock(page); +} + +static int logfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = page->index; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned end = start + copied; + int ret = 0; + + BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize); + BUG_ON(page->index > I3_BLOCKS); + + if (copied < len) { + /* + * Short write of a non-initialized paged. Just tell userspace + * to retry the entire page. + */ + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + if (copied == 0) + goto out; /* FIXME: do we need to update inode? 
+
+	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
+		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+		mark_inode_dirty_sync(inode);
+	}
+
+	SetPageUptodate(page);
+	if (!PageDirty(page)) {
+		if (!get_page_reserve(inode, page))
+			__set_page_dirty_nobuffers(page);
+		else
+			ret = logfs_write_buf(inode, page, WF_LOCK);
+	}
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	return ret ? ret : copied;
+}
+
+int logfs_readpage(struct file *file, struct page *page)
+{
+	int ret;
+
+	ret = logfs_readpage_nolock(page);
+	unlock_page(page);
+	return ret;
+}
+
+/* Clear the page's dirty flag in the radix tree. */
+/* TODO: mucking with PageWriteback is silly.  Add a generic function to clear
+ * the dirty bit from the radix tree for filesystems that don't have to wait
+ * for page writeback to finish (i.e. any compressing filesystem).
+ */
+static void clear_radix_tree_dirty(struct page *page)
+{
+	BUG_ON(PagePrivate(page) || page->private);
+	set_page_writeback(page);
+	end_page_writeback(page);
+}
+
+static int __logfs_writepage(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	int err;
+
+	err = logfs_write_buf(inode, page, WF_LOCK);
+	if (err)
+		set_page_dirty(page);
+	else
+		clear_radix_tree_dirty(page);
+	unlock_page(page);
+	return err;
+}
+
+static int logfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	u64 bix;
+	level_t level;
+
+	log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
+			page);
+
+	logfs_unpack_index(page->index, &bix, &level);
+
+	/* Indirect blocks are never truncated */
+	if (level != 0)
+		return __logfs_writepage(page);
+
+	/*
+	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
+	 * The relevant bits should be factored out after logfs is merged.
+	 */
+
+	/* Is the page fully inside i_size? */
+	if (bix < end_index)
+		return __logfs_writepage(page);
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (bix > end_index || offset == 0) {
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
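+	 *
+	 * A worked example (hypothetical numbers): with 4096-byte pages and
+	 * i_size = 5000, the last page covers bytes 4096-4999, offset becomes
+	 * 5000 & 4095 = 904, and bytes 904-4095 are zeroed below before the
+	 * buffer is written out.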
+ */ + zero_user_segment(page, offset, PAGE_CACHE_SIZE); + return __logfs_writepage(page); +} + +static void logfs_invalidatepage(struct page *page, unsigned long offset) +{ + move_page_to_btree(page); + BUG_ON(PagePrivate(page) || page->private); +} + +static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) +{ + return 0; /* None of these are easy to release */ +} + + +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct logfs_inode *li = logfs_inode(inode); + unsigned int oldflags, flags; + int err; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = li->li_flags & LOGFS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); + case FS_IOC_SETFLAGS: + if (IS_RDONLY(inode)) + return -EROFS; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = get_user(flags, (int __user *)arg); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + oldflags = li->li_flags; + flags &= LOGFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; + li->li_flags = flags; + mutex_unlock(&inode->i_mutex); + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + return 0; + + default: + return -ENOTTY; + } +} + +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct logfs_super *super = logfs_super(sb); + + /* FIXME: write anchor */ + super->s_devops->sync(sb); + return 0; +} + +static int logfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + + if (attr->ia_valid & ATTR_SIZE) + err = logfs_truncate(inode, attr->ia_size); + attr->ia_valid &= ~ATTR_SIZE; + + if (!err) + err = inode_change_ok(inode, attr); + if (!err) + err = inode_setattr(inode, attr); + return err; +} + +const struct inode_operations logfs_reg_iops = { + .setattr = logfs_setattr, +}; + +const struct file_operations logfs_reg_fops = { + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .fsync = logfs_fsync, + .ioctl = logfs_ioctl, + .llseek = generic_file_llseek, + .mmap = generic_file_readonly_mmap, + .open = generic_file_open, + .read = do_sync_read, + .write = do_sync_write, +}; + +const struct address_space_operations logfs_reg_aops = { + .invalidatepage = logfs_invalidatepage, + .readpage = logfs_readpage, + .releasepage = logfs_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, + .writepage = logfs_writepage, + .writepages = generic_writepages, + .write_begin = logfs_write_begin, + .write_end = logfs_write_end, +}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c new file mode 100644 index 00000000000..92949f95a90 --- /dev/null +++ b/fs/logfs/gc.c @@ -0,0 +1,730 @@ +/* + * fs/logfs/gc.c - garbage collection code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" +#include <linux/sched.h> + +/* + * Wear leveling needs to kick in when the difference between low erase + * counts and high erase counts gets too big. A good value for "too big" + * may be somewhat below 10% of maximum erase count for the device. + * Why not 397, to pick a nice round number with no specific meaning? :) + * + * WL_RATELIMIT is the minimum time between two wear level events. A huge + * number of segments may fulfil the requirements for wear leveling at the + * same time. 
If that happens we don't want to cause a latency from hell, + * but just gently pick one segment every so often and minimize overhead. + */ +#define WL_DELTA 397 +#define WL_RATELIMIT 100 +#define MAX_OBJ_ALIASES 2600 +#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ +#define LIST_SIZE 64 /* base size of candidate lists */ +#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ +#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ + +static int no_free_segments(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + return super->s_free_list.count; +} + +/* journal has distance -1, top-most ifile layer distance 0 */ +static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) +{ + struct logfs_super *super = logfs_super(sb); + u8 gc_level = (__force u8)__gc_level; + + switch (gc_level) { + case 0: /* fall through */ + case 1: /* fall through */ + case 2: /* fall through */ + case 3: + /* file data or indirect blocks */ + return super->s_ifile_levels + super->s_iblock_levels - gc_level; + case 6: /* fall through */ + case 7: /* fall through */ + case 8: /* fall through */ + case 9: + /* inode file data or indirect blocks */ + return super->s_ifile_levels - (gc_level - 6); + default: + printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", + gc_level); + WARN_ON(1); + return super->s_ifile_levels + super->s_iblock_levels; + } +} + +static int segment_is_reserved(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area; + void *reserved; + int i; + + /* Some segments are reserved. Just pretend they were all valid */ + reserved = btree_lookup32(&super->s_reserved_segments, segno); + if (reserved) + return 1; + + /* Currently open segments */ + for_each_area(i) { + area = super->s_area[i]; + if (area->a_is_open && area->a_segno == segno) + return 1; + } + + return 0; +} + +static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) +{ + BUG(); +} + +/* + * Returns the bytes consumed by valid objects in this segment. Object headers + * are counted, the segment header is not. 
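+ *
+ * To make the constants below concrete: wear leveling considers moving a
+ * used segment only while its erase count is below that of the least-worn
+ * free segment plus WL_DELTA, and WL_RATELIMIT means at most one such
+ * move per 100 increments of the global erase counter.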
+ */ +static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, + gc_level_t *gc_level) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(sb, segno, &se); + if (se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)) + return RESERVED; + + ec_level = be32_to_cpu(se.ec_level); + *ec = ec_level >> 4; + *gc_level = GC_LEVEL(ec_level & 0xf); + return be32_to_cpu(se.valid); +} + +static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, + u64 bix, gc_level_t gc_level) +{ + struct inode *inode; + int err, cookie; + + inode = logfs_safe_iget(sb, ino, &cookie); + err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); + BUG_ON(err); + logfs_safe_iput(inode, cookie); +} + +static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header sh; + struct logfs_object_header oh; + u64 ofs, ino, bix; + u32 seg_ofs, logical_segno, cleaned = 0; + int err, len, valid; + gc_level_t gc_level; + + LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); + + btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + BUG_ON(err); + gc_level = GC_LEVEL(sh.level); + logical_segno = be32_to_cpu(sh.segno); + if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = -1; + goto out; + } + + for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; + seg_ofs + sizeof(oh) < super->s_segsize; ) { + ofs = dev_ofs(sb, logical_segno, seg_ofs); + err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), + &oh); + BUG_ON(err); + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { + logfs_mark_segment_bad(sb, segno); + cleaned = super->s_segsize - 1; + goto out; + } + + ino = be64_to_cpu(oh.ino); + bix = be64_to_cpu(oh.bix); + len = sizeof(oh) + be16_to_cpu(oh.len); + valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); + if (valid == 1) { + logfs_cleanse_block(sb, ofs, ino, bix, gc_level); + cleaned += len; + } else if (valid == 2) { + /* Will be invalid upon journal commit */ + cleaned += len; + } + seg_ofs += len; + } +out: + btree_remove32(&super->s_reserved_segments, segno); + return cleaned; +} + +static struct gc_candidate *add_list(struct gc_candidate *cand, + struct candidate_list *list) +{ + struct rb_node **p = &list->rb_tree.rb_node; + struct rb_node *parent = NULL; + struct gc_candidate *cur; + int comp; + + cand->list = list; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct gc_candidate, rb_node); + + if (list->sort_by_ec) + comp = cand->erase_count < cur->erase_count; + else + comp = cand->valid < cur->valid; + + if (comp) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&cand->rb_node, parent, p); + rb_insert_color(&cand->rb_node, &list->rb_tree); + + if (list->count <= list->maxcount) { + list->count++; + return NULL; + } + cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); + rb_erase(&cand->rb_node, &list->rb_tree); + cand->list = NULL; + return cand; +} + +static void remove_from_list(struct gc_candidate *cand) +{ + struct candidate_list *list = cand->list; + + rb_erase(&cand->rb_node, &list->rb_tree); + list->count--; +} + +static void free_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + + btree_remove32(&super->s_cand_tree, cand->segno); + kfree(cand); +} + +u32 
get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) +{ + struct gc_candidate *cand; + u32 segno; + + BUG_ON(list->count == 0); + + cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); + remove_from_list(cand); + segno = cand->segno; + if (ec) + *ec = cand->erase_count; + free_candidate(sb, cand); + return segno; +} + +/* + * We have several lists to manage segments with. The reserve_list is used to + * deal with bad blocks. We try to keep the best (lowest ec) segments on this + * list. + * The free_list contains free segments for normal usage. It usually gets the + * second pick after the reserve_list. But when the free_list is running short + * it is more important to keep the free_list full than to keep a reserve. + * + * Segments that are not free are put onto a per-level low_list. If we have + * to run garbage collection, we pick a candidate from there. All segments on + * those lists should have at least some free space so GC will make progress. + * + * And last we have the ec_list, which is used to pick segments for wear + * leveling. + * + * If all appropriate lists are full, we simply free the candidate and forget + * about that segment for a while. We have better candidates for each purpose. + */ +static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) +{ + struct logfs_super *super = logfs_super(sb); + u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; + + if (cand->valid == 0) { + /* 100% free segments */ + log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_reserve_list); + if (cand) { + log_gc_noisy("add free segment %x (ec %x) at %llx\n", + cand->segno, cand->erase_count, + dev_ofs(sb, cand->segno, 0)); + cand = add_list(cand, &super->s_free_list); + } + } else { + /* good candidates for Garbage Collection */ + if (cand->valid < full) + cand = add_list(cand, &super->s_low_list[cand->dist]); + /* good candidates for wear leveling, + * segments that were recently written get ignored */ + if (cand) + cand = add_list(cand, &super->s_ec_list); + } + if (cand) + free_candidate(sb, cand); +} + +static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, + u8 dist) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = kmalloc(sizeof(*cand), GFP_NOFS); + if (!cand) + return -ENOMEM; + + cand->segno = segno; + cand->valid = valid; + cand->erase_count = ec; + cand->dist = dist; + + btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); + __add_candidate(sb, cand); + return 0; +} + +static void remove_segment_from_lists(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + + cand = btree_lookup32(&super->s_cand_tree, segno); + if (cand) { + remove_from_list(cand); + free_candidate(sb, cand); + } +} + +static void scan_segment(struct super_block *sb, u32 segno) +{ + u32 valid, ec = 0; + gc_level_t gc_level = 0; + u8 dist; + + if (segment_is_reserved(sb, segno)) + return; + + remove_segment_from_lists(sb, segno); + valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); + if (valid == RESERVED) + return; + + dist = root_distance(sb, gc_level); + add_candidate(sb, segno, valid, ec, dist); +} + +static struct gc_candidate *first_in_list(struct candidate_list *list) +{ + if (list->count == 0) + return NULL; + return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); +} + +/* + * Find the 
best segment for garbage collection.  Main criterion is
+ * the segment requiring the least effort to clean.  Secondary
+ * criterion is to GC on the lowest level available.
+ *
+ * So we search the least effort segment on the lowest level first,
+ * then move up and pick another segment iff it requires significantly
+ * less effort.  Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
+ */
+static struct gc_candidate *get_candidate(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i, max_dist;
+	struct gc_candidate *cand = NULL, *this;
+
+	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+
+	for (i = max_dist; i >= 0; i--) {
+		this = first_in_list(&super->s_low_list[i]);
+		if (!this)
+			continue;
+		if (!cand)
+			cand = this;
+		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
+			cand = this;
+	}
+	return cand;
+}
+
+static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
+{
+	struct logfs_super *super = logfs_super(sb);
+	gc_level_t gc_level;
+	u32 cleaned, valid, segno, ec;
+	u8 dist;
+
+	if (!cand) {
+		log_gc("GC attempted, but no candidate found\n");
+		return 0;
+	}
+
+	segno = cand->segno;
+	dist = cand->dist;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	free_candidate(sb, cand);
+	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
+			segno, (u64)segno << super->s_segshift,
+			dist, no_free_segments(sb), valid,
+			super->s_free_bytes);
+	cleaned = logfs_gc_segment(sb, segno, dist);
+	log_gc("GC segment #%02x complete - now %x valid\n", segno,
+			valid - cleaned);
+	BUG_ON(cleaned != valid);
+	return 1;
+}
+
+static int logfs_gc_once(struct super_block *sb)
+{
+	struct gc_candidate *cand;
+
+	cand = get_candidate(sb);
+	if (cand)
+		remove_from_list(cand);
+	return __logfs_gc_once(sb, cand);
+}
+
+/* returns 1 if a wrap occurs, 0 otherwise */
+static int logfs_scan_some(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	u32 segno;
+	int i, ret = 0;
+
+	segno = super->s_sweeper;
+	for (i = SCAN_RATIO; i > 0; i--) {
+		segno++;
+		if (segno >= super->s_no_segs) {
+			segno = 0;
+			ret = 1;
+			/* Break out of the loop.  We want to read a single
+			 * block from the segment size on next invocation if
+			 * SCAN_RATIO is set to match block size
+			 */
+			break;
+		}
+
+		scan_segment(sb, segno);
+	}
+	super->s_sweeper = segno;
+	return ret;
+}
+
+/*
+ * In principle, this function should loop forever, looking for GC candidates
+ * and moving data.  LogFS is designed in such a way that this loop is
+ * guaranteed to terminate.
+ *
+ * Limiting the loop to some iterations serves purely to catch cases when
+ * these guarantees have failed.  An actual endless loop is an obvious bug
+ * and should be reported as such.
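+ *
+ * With the constants above that means at most SCAN_ROUNDS complete
+ * sweeps of the medium, or roughly three consecutive sweeps without any
+ * GC progress, before the final LOGFS_BUG() fires.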
+ */
+static void __logfs_gc_pass(struct super_block *sb, int target)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int round, progress, last_progress = 0;
+
+	if (no_free_segments(sb) >= target &&
+			super->s_no_object_aliases < MAX_OBJ_ALIASES)
+		return;
+
+	log_gc("__logfs_gc_pass(%x)\n", target);
+	for (round = 0; round < SCAN_ROUNDS; ) {
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+
+		/* Sync in-memory state with on-medium state in case they
+		 * diverged */
+		logfs_write_anchor(sb);
+		round += logfs_scan_some(sb);
+		if (no_free_segments(sb) >= target)
+			goto write_alias;
+		progress = logfs_gc_once(sb);
+		if (progress)
+			last_progress = round;
+		else if (round - last_progress > 2)
+			break;
+		continue;
+
+		/*
+		 * The goto logic is nasty, I just don't know a better way to
+		 * code it.  GC is supposed to ensure two things:
+		 * 1. Enough free segments are available.
+		 * 2. The number of aliases is bounded.
+		 * When 1. is achieved, we take a look at 2. and write back
+		 * some alias-containing blocks, if necessary.  However, after
+		 * each such write we need to go back to 1., as writes can
+		 * consume free segments.
+		 */
+write_alias:
+		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
+			return;
+		if (list_empty(&super->s_object_alias)) {
+			/* All aliases are still in btree */
+			return;
+		}
+		log_gc("Write back one alias\n");
+		block = list_entry(super->s_object_alias.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+		/*
+		 * To round off the nasty goto logic, we reset round here.  It
+		 * is a safety-net for GC not making any progress and limited
+		 * to something reasonably small.  If we incremented it for
+		 * every single alias, the loop could terminate rather quickly.
+		 */
+		round = 0;
+	}
+	LOGFS_BUG(sb);
+}
+
+static int wl_ratelimit(struct super_block *sb, u64 *next_event)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (*next_event < super->s_gec) {
+		*next_event = super->s_gec + WL_RATELIMIT;
+		return 0;
+	}
+	return 1;
+}
+
+static void logfs_wl_pass(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct gc_candidate *wl_cand, *free_cand;
+
+	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
+		return;
+
+	wl_cand = first_in_list(&super->s_ec_list);
+	if (!wl_cand)
+		return;
+	free_cand = first_in_list(&super->s_free_list);
+	if (!free_cand)
+		return;
+
+	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
+		remove_from_list(wl_cand);
+		__logfs_gc_once(sb, wl_cand);
+	}
+}
+
+/*
+ * The journal needs wear leveling as well.  But moving the journal is an
+ * expensive operation so we try to avoid it as much as possible.  And if we
+ * have to do it, we move the whole journal, not individual segments.
+ *
+ * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
+ * calculations.  First we check whether moving the journal would be a
+ * significant improvement.  That means that a) the current journal segments
+ * have more wear than the future journal segments and b) the current journal
+ * segments have more wear than normal ostore segments.
+ * Rationale for b) is that we don't have to move the journal if it is aging
+ * less than the ostore, even if the reserve segments age even less (they are
+ * excluded from wear leveling, after all).
+ * Next we check that the superblocks have less wear than the journal.  Since
+ * moving the journal requires writing the superblocks, we have to protect the
+ * superblocks even more than the journal.
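+ * In the code below this boils down to: the journal is only moved once
+ * min_journal_ec > max_reserve_ec + 2 * WL_DELTA, where max_reserve_ec
+ * is seeded from the least-worn free segment and raised to cover both
+ * superblock segments.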
+ * + * Also we double the acceptable wear difference, compared to ostore wear + * leveling. Journal data is read and rewritten rapidly, comparatively. So + * soft errors have much less time to accumulate and we allow the journal to + * be a bit worse than the ostore. + */ +static void logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct gc_candidate *cand; + u32 min_journal_ec = -1, max_reserve_ec = 0; + int i; + + if (wl_ratelimit(sb, &super->s_wl_gec_journal)) + return; + + if (super->s_reserve_list.count < super->s_no_journal_segs) { + /* Reserve is not full enough to move complete journal */ + return; + } + + journal_for_each(i) + if (super->s_journal_seg[i]) + min_journal_ec = min(min_journal_ec, + super->s_journal_ec[i]); + cand = rb_entry(rb_first(&super->s_free_list.rb_tree), + struct gc_candidate, rb_node); + max_reserve_ec = cand->erase_count; + for (i = 0; i < 2; i++) { + struct logfs_segment_entry se; + u32 segno = seg_no(sb, super->s_sb_ofs[i]); + u32 ec; + + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + max_reserve_ec = max(max_reserve_ec, ec); + } + + if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { + do_logfs_journal_wl_pass(sb); + } +} + +void logfs_gc_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); + /* Write journal before free space is getting saturated with dirty + * objects. + */ + if (super->s_dirty_used_bytes + super->s_dirty_free_bytes + + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) + logfs_write_anchor(sb); + __logfs_gc_pass(sb, super->s_total_levels); + logfs_wl_pass(sb); + logfs_journal_wl_pass(sb); +} + +static int check_area(struct super_block *sb, int i) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[i]; + struct logfs_object_header oh; + u32 segno = area->a_segno; + u32 ofs = area->a_used_bytes; + __be32 crc; + int err; + + if (!area->a_is_open) + return 0; + + for (ofs = area->a_used_bytes; + ofs <= super->s_segsize - sizeof(oh); + ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) { + err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh); + if (err) + return err; + + if (!memchr_inv(&oh, 0xff, sizeof(oh))) + break; + + crc = logfs_crc32(&oh, sizeof(oh) - 4, 4); + if (crc != oh.crc) { + printk(KERN_INFO "interrupted header at %llx\n", + dev_ofs(sb, segno, ofs)); + return 0; + } + } + if (ofs != area->a_used_bytes) { + printk(KERN_INFO "%x bytes unaccounted data found at %llx\n", + ofs - area->a_used_bytes, + dev_ofs(sb, segno, area->a_used_bytes)); + area->a_used_bytes = ofs; + } + return 0; +} + +int logfs_check_areas(struct super_block *sb) +{ + int i, err; + + for_each_area(i) { + err = check_area(sb, i); + if (err) + return err; + } + return 0; +} + +static void logfs_init_candlist(struct candidate_list *list, int maxcount, + int sort_by_ec) +{ + list->count = 0; + list->maxcount = maxcount; + list->sort_by_ec = sort_by_ec; + list->rb_tree = RB_ROOT; +} + +int logfs_init_gc(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); + logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); + logfs_init_candlist(&super->s_reserve_list, + super->s_bad_seg_reserve, 1); + for_each_area(i) + logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); + logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); + return 0; +} + +static 
void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
+{
+	struct gc_candidate *cand;
+
+	while (list->count) {
+		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
+				rb_node);
+		remove_from_list(cand);
+		free_candidate(sb, cand);
+	}
+	BUG_ON(list->rb_tree.rb_node);
+}
+
+void logfs_cleanup_gc(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	int i;
+
+	if (!super->s_free_list.count)
+		return;
+
+	/*
+	 * FIXME: The btree may still contain a single empty node.  So we
+	 * call the grim visitor to clean up that mess.  Btree code should
+	 * do it for us, really.
+	 */
+	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
+	logfs_cleanup_list(sb, &super->s_free_list);
+	logfs_cleanup_list(sb, &super->s_reserve_list);
+	for_each_area(i)
+		logfs_cleanup_list(sb, &super->s_low_list[i]);
+	logfs_cleanup_list(sb, &super->s_ec_list);
+}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 00000000000..33ec1aeaeec
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,417 @@
+/*
+ * fs/logfs/inode.c	- inode handling code
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ */
+#include "logfs.h"
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+/*
+ * How soon to reuse old inode numbers?  LogFS doesn't store deleted inodes
+ * on the medium.  It therefore also lacks a method to store the previous
+ * generation number for deleted inodes.  Instead a single generation number
+ * is stored which will be used for new inodes.  Being just a 32bit counter,
+ * this can obviously wrap relatively quickly.  So we only reuse inodes if we
+ * know that a fair number of inodes can be created before we have to increment
+ * the generation again - effectively adding some bits to the counter.
+ * But being too aggressive here means we keep a very large and very sparse
+ * inode file, wasting space on indirect blocks.
+ * So what is a good value?  Beats me.  64k seems moderately bad on both
+ * fronts, so let's use that for now...
+ *
+ * NFS sucks, as everyone already knows.
+ */
+#define INOS_PER_WRAP (0x10000)
+
+/*
+ * Logfs' requirement to read inodes for garbage collection makes life a bit
+ * harder.  GC may have to read inodes that are in I_FREEING state, when they
+ * are being written out - and waiting for GC to make progress, naturally.
+ *
+ * So we cannot just call iget() or some variant of it, but first have to check
+ * whether the inode in question might be in I_FREEING state.  Therefore we
+ * maintain our own per-sb list of "almost deleted" inodes and check against
+ * that list first.  Normally this should be at most 1-2 entries long.
+ *
+ * Also, inodes have logfs-specific reference counting on top of what the vfs
+ * does.  When .destroy_inode is called, normally the reference count will drop
+ * to zero and the inode gets deleted.  But if GC accessed the inode, its
+ * refcount will remain nonzero and final deletion will have to wait.
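+ *
+ * (The extra count lives in li_refcount: it starts out at 1, is bumped by
+ * logfs_safe_iget() for inodes found on s_freeing_list and is dropped
+ * again by logfs_safe_iput(); the inode is only freed once it reaches
+ * zero.)  A typical GC-side caller, cf. logfs_cleanse_block() in gc.c:
+ *
+ *	int cookie;
+ *	struct inode *inode = logfs_safe_iget(sb, ino, &cookie);
+ *	if (!IS_ERR(inode)) {
+ *		... work on the inode's blocks ...
+ *		logfs_safe_iput(inode, cookie);
+ *	}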
+ * + * As a result we have two sets of functions to get/put inodes: + * logfs_safe_iget/logfs_safe_iput - safe to call from GC context + * logfs_iget/iput - normal version + */ +static struct kmem_cache *logfs_inode_cache; + +static DEFINE_SPINLOCK(logfs_inode_lock); + +static void logfs_inode_setops(struct inode *inode) +{ + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + inode->i_op = &logfs_dir_iops; + inode->i_fop = &logfs_dir_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFREG: + inode->i_op = &logfs_reg_iops; + inode->i_fop = &logfs_reg_fops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFLNK: + inode->i_op = &logfs_symlink_iops; + inode->i_mapping->a_ops = &logfs_reg_aops; + break; + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + default: + BUG(); + } +} + +static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) +{ + struct inode *inode = iget_locked(sb, ino); + int err; + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = logfs_read_inode(inode); + if (err || inode->i_nlink == 0) { + /* inode->i_nlink == 0 can be true when called from + * block validator */ + /* set i_nlink to 0 to prevent caching */ + inode->i_nlink = 0; + logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; + iget_failed(inode); + if (!err) + err = -ENOENT; + return ERR_PTR(err); + } + + logfs_inode_setops(inode); + unlock_new_inode(inode); + return inode; +} + +struct inode *logfs_iget(struct super_block *sb, ino_t ino) +{ + BUG_ON(ino == LOGFS_INO_MASTER); + BUG_ON(ino == LOGFS_INO_SEGFILE); + return __logfs_iget(sb, ino); +} + +/* + * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
+ * this allows logfs_safe_iput() to do the right thing later
+ */
+struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_inode *li;
+
+	if (ino == LOGFS_INO_MASTER)
+		return super->s_master_inode;
+	if (ino == LOGFS_INO_SEGFILE)
+		return super->s_segfile_inode;
+
+	spin_lock(&logfs_inode_lock);
+	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
+		if (li->vfs_inode.i_ino == ino) {
+			li->li_refcount++;
+			spin_unlock(&logfs_inode_lock);
+			*is_cached = 1;
+			return &li->vfs_inode;
+		}
+	spin_unlock(&logfs_inode_lock);
+
+	*is_cached = 0;
+	return __logfs_iget(sb, ino);
+}
+
+static void __logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(li->li_block);
+	list_del(&li->li_freeing_list);
+	kmem_cache_free(logfs_inode_cache, li);
+}
+
+static void logfs_destroy_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	BUG_ON(list_empty(&li->li_freeing_list));
+	spin_lock(&logfs_inode_lock);
+	li->li_refcount--;
+	if (li->li_refcount == 0)
+		__logfs_destroy_inode(inode);
+	spin_unlock(&logfs_inode_lock);
+}
+
+void logfs_safe_iput(struct inode *inode, int is_cached)
+{
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		return;
+	if (inode->i_ino == LOGFS_INO_SEGFILE)
+		return;
+
+	if (is_cached) {
+		logfs_destroy_inode(inode);
+		return;
+	}
+
+	iput(inode);
+}
+
+static void logfs_init_inode(struct super_block *sb, struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	int i;
+
+	li->li_flags = 0;
+	li->li_height = 0;
+	li->li_used_bytes = 0;
+	li->li_block = NULL;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_size = 0;
+	inode->i_blocks = 0;
+	inode->i_ctime = CURRENT_TIME;
+	inode->i_mtime = CURRENT_TIME;
+	inode->i_nlink = 1;
+	INIT_LIST_HEAD(&li->li_freeing_list);
+
+	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
+		li->li_data[i] = 0;
+
+	return;
+}
+
+static struct inode *logfs_alloc_inode(struct super_block *sb)
+{
+	struct logfs_inode *li;
+
+	li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
+	if (!li)
+		return NULL;
+	logfs_init_inode(sb, &li->vfs_inode);
+	return &li->vfs_inode;
+}
+
+/*
+ * In logfs inodes are written to an inode file.  The inode file, like any
+ * other file, is managed with an inode.  The inode file's inode, aka master
+ * inode, requires special handling in several respects.  First, it cannot be
+ * written to the inode file, so it is stored in the journal instead.
+ *
+ * Secondly, this inode cannot be written back and destroyed before all other
+ * inodes have been written.  The ordering is important.  Linux' VFS is happily
+ * unaware of the ordering constraint and would ordinarily destroy the master
+ * inode at umount time while other inodes are still in use and dirty.  Not
+ * good.
+ *
+ * So logfs makes sure the master inode is not written until all other inodes
+ * have been destroyed.  Sadly, this method has another side-effect.  The VFS
+ * will notice one remaining inode and print a frightening warning message.
+ * Worse, it is impossible to judge whether such a warning was caused by the
+ * master inode or any other inodes have leaked as well.
+ *
+ * Our attempt at solving this is with logfs_new_meta_inode() below.  Its
+ * purpose is to create a new inode that will not trigger the warning if such
+ * an inode is still in use.  An ugly hack, no doubt.  Suggestions for
+ * improvement are welcome.
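+ *
+ * Note that meta inodes bypass the inode hash and the icache entirely:
+ * they are set up via logfs_new_meta_inode()/logfs_read_meta_inode()
+ * below and torn down with destroy_meta_inode(), never through iput().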
+ */ +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + + inode = logfs_alloc_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + inode->i_mode = S_IFREG; + inode->i_ino = ino; + inode->i_sb = sb; + + /* This is a blatant copy of alloc_inode code. We'd need alloc_inode + * to be nonstatic, alas. */ + { + struct address_space * const mapping = &inode->i_data; + + mapping->a_ops = &logfs_reg_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = &default_backing_dev_info; + inode->i_mapping = mapping; + inode->i_nlink = 1; + } + + return inode; +} + +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) +{ + struct inode *inode; + int err; + + inode = logfs_new_meta_inode(sb, ino); + if (IS_ERR(inode)) + return inode; + + err = logfs_read_inode(inode); + if (err) { + destroy_meta_inode(inode); + return ERR_PTR(err); + } + logfs_inode_setops(inode); + return inode; +} + +static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + long flags = WF_LOCK; + + /* Can only happen if creat() failed. Safe to skip. */ + if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) + return 0; + + ret = __logfs_write_inode(inode, flags); + LOGFS_BUG_ON(ret, inode->i_sb); + return ret; +} + +void destroy_meta_inode(struct inode *inode) +{ + if (inode) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + logfs_clear_inode(inode); + kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); + } +} + +/* called with inode_lock held */ +static void logfs_drop_inode(struct inode *inode) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + struct logfs_inode *li = logfs_inode(inode); + + spin_lock(&logfs_inode_lock); + list_move(&li->li_freeing_list, &super->s_freeing_list); + spin_unlock(&logfs_inode_lock); + generic_drop_inode(inode); +} + +static void logfs_set_ino_generation(struct super_block *sb, + struct inode *inode) +{ + struct logfs_super *super = logfs_super(sb); + u64 ino; + + mutex_lock(&super->s_journal_mutex); + ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino); + super->s_last_ino = ino; + super->s_inos_till_wrap--; + if (super->s_inos_till_wrap < 0) { + super->s_last_ino = LOGFS_RESERVED_INOS; + super->s_generation++; + super->s_inos_till_wrap = INOS_PER_WRAP; + } + inode->i_ino = ino; + inode->i_generation = super->s_generation; + mutex_unlock(&super->s_journal_mutex); +} + +struct inode *logfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + logfs_init_inode(sb, inode); + + /* inherit parent flags */ + logfs_inode(inode)->li_flags |= + logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; + + inode->i_mode = mode; + logfs_set_ino_generation(sb, inode); + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + logfs_inode_setops(inode); + insert_inode_hash(inode); + + return inode; +} + +static void logfs_init_once(void *_li) +{ + struct logfs_inode *li = _li; + int i; + + li->li_flags = 0; + li->li_used_bytes = 0; + li->li_refcount = 1; + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = 0; + inode_init_once(&li->vfs_inode); +} + +static int logfs_sync_fs(struct super_block *sb, int wait) +{ + /* 
FIXME: write anchor */ + logfs_super(sb)->s_devops->sync(sb); + return 0; +} + +const struct super_operations logfs_super_operations = { + .alloc_inode = logfs_alloc_inode, + .clear_inode = logfs_clear_inode, + .delete_inode = logfs_delete_inode, + .destroy_inode = logfs_destroy_inode, + .drop_inode = logfs_drop_inode, + .write_inode = logfs_write_inode, + .statfs = logfs_statfs, + .sync_fs = logfs_sync_fs, +}; + +int logfs_init_inode_cache(void) +{ + logfs_inode_cache = kmem_cache_create("logfs_inode_cache", + sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + logfs_init_once); + if (!logfs_inode_cache) + return -ENOMEM; + return 0; +} + +void logfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(logfs_inode_cache); +} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c new file mode 100644 index 00000000000..6ad30a4c905 --- /dev/null +++ b/fs/logfs/journal.c @@ -0,0 +1,883 @@ +/* + * fs/logfs/journal.c - journal handling code + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + */ +#include "logfs.h" + +static void logfs_calc_free(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 reserve, no_segs = super->s_no_segs; + s64 free; + int i; + + /* superblock segments */ + no_segs -= 2; + super->s_no_journal_segs = 0; + /* journal */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + no_segs--; + super->s_no_journal_segs++; + } + + /* open segments plus one extra per level for GC */ + no_segs -= 2 * super->s_total_levels; + + free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); + free -= super->s_used_bytes; + /* just a bit extra */ + free -= super->s_total_levels * 4096; + + /* Bad blocks are 'paid' for with speed reserve - the filesystem + * simply gets slower as bad blocks accumulate. Until the bad blocks + * exceed the speed reserve - then the filesystem gets smaller. 
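+	 *
+	 * For example (made-up numbers): with 3 known bad segments and a
+	 * s_bad_seg_reserve of 8, eleven segments' worth of payload is
+	 * withheld from the free count - unless s_speed_reserve is larger
+	 * still, in which case that value wins.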
+ */ + reserve = super->s_bad_segments + super->s_bad_seg_reserve; + reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; + reserve = max(reserve, super->s_speed_reserve); + free -= reserve; + if (free < 0) + free = 0; + + super->s_free_bytes = free; +} + +static void reserve_sb_and_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct btree_head32 *head = &super->s_reserved_segments; + int i, err; + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, + GFP_KERNEL); + BUG_ON(err); + + journal_for_each(i) { + if (!super->s_journal_seg[i]) + continue; + err = btree_insert32(head, super->s_journal_seg[i], (void *)1, + GFP_KERNEL); + BUG_ON(err); + } +} + +static void read_dynsb(struct super_block *sb, + struct logfs_je_dynsb *dynsb) +{ + struct logfs_super *super = logfs_super(sb); + + super->s_gec = be64_to_cpu(dynsb->ds_gec); + super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); + super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); + super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); + super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); + super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); + super->s_generation = be32_to_cpu(dynsb->ds_generation); +} + +static void read_anchor(struct super_block *sb, + struct logfs_je_anchor *da) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + super->s_last_ino = be64_to_cpu(da->da_last_ino); + li->li_flags = 0; + li->li_height = da->da_height; + i_size_write(inode, be64_to_cpu(da->da_size)); + li->li_used_bytes = be64_to_cpu(da->da_used_bytes); + + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(da->da_data[i]); +} + +static void read_erasecount(struct super_block *sb, + struct logfs_je_journal_ec *ec) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + journal_for_each(i) + super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); +} + +static int read_area(struct super_block *sb, struct logfs_je_area *a) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[a->gc_level]; + u64 ofs; + u32 writemask = ~(super->s_writesize - 1); + + if (a->gc_level >= LOGFS_NO_AREAS) + return -EIO; + if (a->vim != VIM_DEFAULT) + return -EIO; /* TODO: close area and continue */ + + area->a_used_bytes = be32_to_cpu(a->used_bytes); + area->a_written_bytes = area->a_used_bytes & writemask; + area->a_segno = be32_to_cpu(a->segno); + if (area->a_segno) + area->a_is_open = 1; + + ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + if (super->s_writesize > 1) + logfs_buf_recover(area, ofs, a + 1, super->s_writesize); + else + logfs_buf_recover(area, ofs, NULL, 0); + return 0; +} + +static void *unpack(void *from, void *to) +{ + struct logfs_journal_header *jh = from; + void *data = from + sizeof(struct logfs_journal_header); + int err; + size_t inlen, outlen; + + inlen = be16_to_cpu(jh->h_len); + outlen = be16_to_cpu(jh->h_datalen); + + if (jh->h_compr == COMPR_NONE) + memcpy(to, data, inlen); + else { + err = logfs_uncompress(data, to, inlen, outlen); + BUG_ON(err); + } + return to; +} + +static int __read_je_header(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; 
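+	/* bufsize is the sanity cap for h_datalen below: no payload may
+	 * exceed one block or one write-size chunk plus header overhead */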
+ u16 type, len, datalen; + int err; + + /* read header only */ + err = wbuf_read(sb, ofs, sizeof(*jh), jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if (len > sb->s_blocksize) + return -EIO; + if ((type < JE_FIRST) || (type > JE_LAST)) + return -EIO; + if (datalen > bufsize) + return -EIO; + return 0; +} + +static int __read_je_payload(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + u16 len; + int err; + + len = be16_to_cpu(jh->h_len); + err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); + if (err) + return err; + if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { + /* Old code was confused. It forgot about the header length + * and stopped calculating the crc 16 bytes before the end + * of data - ick! + * FIXME: Remove this hack once the old code is fixed. + */ + if (jh->h_crc == logfs_crc32(jh, len, 4)) + WARN_ON_ONCE(1); + else + return -EIO; + } + return 0; +} + +/* + * jh needs to be large enough to hold the complete entry, not just the header + */ +static int __read_je(struct super_block *sb, u64 ofs, + struct logfs_journal_header *jh) +{ + int err; + + err = __read_je_header(sb, ofs, jh); + if (err) + return err; + return __read_je_payload(sb, ofs, jh); +} + +static int read_je(struct super_block *sb, u64 ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + void *scratch = super->s_je; + u16 type, datalen; + int err; + + err = __read_je(sb, ofs, jh); + if (err) + return err; + type = be16_to_cpu(jh->h_type); + datalen = be16_to_cpu(jh->h_datalen); + + switch (type) { + case JE_DYNSB: + read_dynsb(sb, unpack(jh, scratch)); + break; + case JE_ANCHOR: + read_anchor(sb, unpack(jh, scratch)); + break; + case JE_ERASECOUNT: + read_erasecount(sb, unpack(jh, scratch)); + break; + case JE_AREA: + read_area(sb, unpack(jh, scratch)); + break; + case JE_OBJ_ALIAS: + err = logfs_load_object_aliases(sb, unpack(jh, scratch), + datalen); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return err; +} + +static int logfs_read_segment(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_journal_header *jh = super->s_compressed_je; + u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); + u32 h_ofs, last_ofs = 0; + u16 len, datalen, last_len = 0; + int i, err; + + /* search for most recent commit */ + for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { + ofs = seg_ofs + h_ofs; + err = __read_je_header(sb, ofs, jh); + if (err) + continue; + if (jh->h_type != cpu_to_be16(JE_COMMIT)) + continue; + err = __read_je_payload(sb, ofs, jh); + if (err) + continue; + len = be16_to_cpu(jh->h_len); + datalen = be16_to_cpu(jh->h_datalen); + if ((datalen > sizeof(super->s_je_array)) || + (datalen % sizeof(__be64))) + continue; + last_ofs = h_ofs; + last_len = datalen; + h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); + } + /* read commit */ + if (last_ofs == 0) + return -ENOENT; + ofs = seg_ofs + last_ofs; + log_journal("Read commit from %llx\n", ofs); + err = __read_je(sb, ofs, jh); + BUG_ON(err); /* We should have caught it in the scan loop already */ + if (err) + return err; + /* uncompress */ + unpack(jh, super->s_je_array); + super->s_no_je = last_len / sizeof(__be64); + /* iterate over array */ + for (i = 0; i < super->s_no_je; i++) { + err = read_je(sb, be64_to_cpu(super->s_je_array[i])); + if (err) + return err; + } + super->s_journal_area->a_segno = segno; + 
return 0; +} + +static u64 read_gec(struct super_block *sb, u32 segno) +{ + struct logfs_segment_header sh; + __be32 crc; + int err; + + if (!segno) + return 0; + err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); + if (err) + return 0; + crc = logfs_crc32(&sh, sizeof(sh), 4); + if (crc != sh.crc) { + WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); + /* Most likely it was just erased */ + return 0; + } + return be64_to_cpu(sh.gec); +} + +static int logfs_read_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + u64 gec[LOGFS_JOURNAL_SEGS], max; + u32 segno; + int i, max_i; + + max = 0; + max_i = -1; + journal_for_each(i) { + segno = super->s_journal_seg[i]; + gec[i] = read_gec(sb, super->s_journal_seg[i]); + if (gec[i] > max) { + max = gec[i]; + max_i = i; + } + } + if (max_i == -1) + return -EIO; + /* FIXME: Try older segments in case of error */ + return logfs_read_segment(sb, super->s_journal_seg[max_i]); +} + +/* + * First search the current segment (outer loop), then pick the next segment + * in the array, skipping any zero entries (inner loop). + */ +static void journal_get_free_segment(struct logfs_area *area) +{ + struct logfs_super *super = logfs_super(area->a_sb); + int i; + + journal_for_each(i) { + if (area->a_segno != super->s_journal_seg[i]) + continue; + + do { + i++; + if (i == LOGFS_JOURNAL_SEGS) + i = 0; + } while (!super->s_journal_seg[i]); + + area->a_segno = super->s_journal_seg[i]; + area->a_erase_count = ++(super->s_journal_ec[i]); + log_journal("Journal now at %x (ec %x)\n", area->a_segno, + area->a_erase_count); + return; + } + BUG(); +} + +static void journal_get_erase_count(struct logfs_area *area) +{ + /* erase count is stored globally and incremented in + * journal_get_free_segment() - nothing to do here */ +} + +static int journal_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 1); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_JOURNAL; + sh.level = 0; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + /* This causes a bug in segment.c. Not yet. 
+	 */
+	//logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
+
+	ofs = dev_ofs(sb, area->a_segno, 0);
+	area->a_used_bytes = ALIGN(sizeof(sh), 16);
+	logfs_buf_write(area, ofs, &sh, sizeof(sh));
+	return 0;
+}
+
+static size_t __logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *jh, size_t len, size_t datalen,
+		u16 type, u8 compr)
+{
+	jh->h_len	= cpu_to_be16(len);
+	jh->h_type	= cpu_to_be16(type);
+	jh->h_datalen	= cpu_to_be16(datalen);
+	jh->h_compr	= compr;
+	jh->h_pad[0]	= 'H';
+	jh->h_pad[1]	= 'E';
+	jh->h_pad[2]	= 'A';
+	jh->h_pad[3]	= 'D';
+	jh->h_pad[4]	= 'R';
+	jh->h_crc	= logfs_crc32(jh, len + sizeof(*jh), 4);
+	return ALIGN(len, 16) + sizeof(*jh);
+}
+
+static size_t logfs_write_header(struct logfs_super *super,
+		struct logfs_journal_header *jh, size_t datalen, u16 type)
+{
+	size_t len = datalen;
+
+	return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
+}
+
+static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
+{
+	return LOGFS_JOURNAL_SEGS * sizeof(__be32);
+}
+
+static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
+		u16 *type, size_t *len)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_je_journal_ec *ec = _ec;
+	int i;
+
+	journal_for_each(i)
+		ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
+	*type = JE_ERASECOUNT;
+	*len = logfs_journal_erasecount_size(super);
+	return ec;
+}
+
+static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
+		size_t ignore2)
+{
+	struct logfs_shadow *shadow = _shadow;
+	struct super_block *sb = (void *)_sb;
+	struct logfs_super *super = logfs_super(sb);
+
+	/* consume new space */
+	super->s_free_bytes	  -= shadow->new_len;
+	super->s_used_bytes	  += shadow->new_len;
+	super->s_dirty_used_bytes -= shadow->new_len;
+
+	/* free up old space */
+	super->s_free_bytes	  += shadow->old_len;
+	super->s_used_bytes	  -= shadow->old_len;
+	super->s_dirty_free_bytes -= shadow->old_len;
+
+	logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
+	logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
+
+	log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
+			shadow->ino, shadow->bix, shadow->gc_level,
+			shadow->old_ofs, shadow->new_ofs,
+			shadow->old_len, shadow->new_len);
+	mempool_free(shadow, super->s_shadow_pool);
+}
+
+static void account_shadows(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode = super->s_master_inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	struct shadow_tree *tree = &super->s_shadow_tree;
+
+	btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
+	btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
+
+	if (li->li_block) {
+		/*
+		 * We never actually use the structure when it is attached to
+		 * the master inode.  But it is easier to always free it here
+		 * than to have checks in several places elsewhere when
+		 * allocating it.
+ */ + li->li_block->ops->free_block(sb, li->li_block); + } + BUG_ON((s64)li->li_used_bytes < 0); +} + +static void *__logfs_write_anchor(struct super_block *sb, void *_da, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_anchor *da = _da; + struct inode *inode = super->s_master_inode; + struct logfs_inode *li = logfs_inode(inode); + int i; + + da->da_height = li->li_height; + da->da_last_ino = cpu_to_be64(super->s_last_ino); + da->da_size = cpu_to_be64(i_size_read(inode)); + da->da_used_bytes = cpu_to_be64(li->li_used_bytes); + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + da->da_data[i] = cpu_to_be64(li->li_data[i]); + *type = JE_ANCHOR; + *len = sizeof(*da); + return da; +} + +static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_je_dynsb *dynsb = _dynsb; + + dynsb->ds_gec = cpu_to_be64(super->s_gec); + dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); + dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); + dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); + dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); + dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); + dynsb->ds_generation = cpu_to_be32(super->s_generation); + *type = JE_DYNSB; + *len = sizeof(*dynsb); + return dynsb; +} + +static void write_wbuf(struct super_block *sb, struct logfs_area *area, + void *wbuf) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + u64 ofs; + pgoff_t index; + int page_ofs; + struct page *page; + + ofs = dev_ofs(sb, area->a_segno, + area->a_used_bytes & ~(super->s_writesize - 1)); + index = ofs >> PAGE_SHIFT; + page_ofs = ofs & (PAGE_SIZE - 1); + + page = find_lock_page(mapping, index); + BUG_ON(!page); + memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); + unlock_page(page); +} + +static void *logfs_write_area(struct super_block *sb, void *_a, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_area[super->s_sum_index]; + struct logfs_je_area *a = _a; + + a->vim = VIM_DEFAULT; + a->gc_level = super->s_sum_index; + a->used_bytes = cpu_to_be32(area->a_used_bytes); + a->segno = cpu_to_be32(area->a_segno); + if (super->s_writesize > 1) + write_wbuf(sb, area, a + 1); + + *type = JE_AREA; + *len = sizeof(*a) + super->s_writesize; + return a; +} + +static void *logfs_write_commit(struct super_block *sb, void *h, + u16 *type, size_t *len) +{ + struct logfs_super *super = logfs_super(sb); + + *type = JE_COMMIT; + *len = super->s_no_je * sizeof(__be64); + return super->s_je_array; +} + +static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, + size_t len) +{ + struct logfs_super *super = logfs_super(sb); + void *header = super->s_compressed_je; + void *data = header + sizeof(struct logfs_journal_header); + ssize_t compr_len, pad_len; + u8 compr = COMPR_ZLIB; + + if (len == 0) + return logfs_write_header(super, header, 0, type); + + compr_len = logfs_compress(buf, data, len, sb->s_blocksize); + if (compr_len < 0 || type == JE_ANCHOR) { + BUG_ON(len > sb->s_blocksize); + memcpy(data, buf, len); + compr_len = len; + compr = COMPR_NONE; + } + + pad_len = ALIGN(compr_len, 16); + memset(data + compr_len, 0, pad_len - compr_len); + + return __logfs_write_header(super, header, compr_len, len, type, compr); +} + +static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, 
+ int must_pad) +{ + u32 writesize = logfs_super(area->a_sb)->s_writesize; + s32 ofs; + int ret; + + ret = logfs_open_area(area, *bytes); + if (ret) + return -EAGAIN; + + ofs = area->a_used_bytes; + area->a_used_bytes += *bytes; + + if (must_pad) { + area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); + *bytes = area->a_used_bytes - ofs; + } + + return dev_ofs(area->a_sb, area->a_segno, ofs); +} + +static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, + size_t buf_len) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + struct logfs_journal_header *jh = super->s_compressed_je; + size_t len; + int must_pad = 0; + s64 ofs; + + len = __logfs_write_je(sb, buf, type, buf_len); + if (jh->h_type == cpu_to_be16(JE_COMMIT)) + must_pad = 1; + + ofs = logfs_get_free_bytes(area, &len, must_pad); + if (ofs < 0) + return ofs; + logfs_buf_write(area, ofs, super->s_compressed_je, len); + super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); + return 0; +} + +static int logfs_write_je(struct super_block *sb, + void* (*write)(struct super_block *sb, void *scratch, + u16 *type, size_t *len)) +{ + void *buf; + size_t len; + u16 type; + + buf = write(sb, logfs_super(sb)->s_je, &type, &len); + return logfs_write_je_buf(sb, buf, type, len); +} + +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_obj_alias *oa = super->s_je; + int err = 0, fill = super->s_je_fill; + + log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", + fill, ino, bix, level, child_no, be64_to_cpu(val)); + oa[fill].ino = cpu_to_be64(ino); + oa[fill].bix = cpu_to_be64(bix); + oa[fill].val = val; + oa[fill].level = (__force u8)level; + oa[fill].child_no = cpu_to_be16(child_no); + fill++; + if (fill >= sb->s_blocksize / sizeof(*oa)) { + err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); + fill = 0; + } + + super->s_je_fill = fill; + return err; +} + +static int logfs_write_obj_aliases(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + log_journal("logfs_write_obj_aliases: %d aliases to write\n", + super->s_no_object_aliases); + super->s_je_fill = 0; + err = logfs_write_obj_aliases_pagecache(sb); + if (err) + return err; + + if (super->s_je_fill) + err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, + super->s_je_fill + * sizeof(struct logfs_obj_alias)); + return err; +} + +/* + * Write all journal entries. The goto logic ensures that all journal entries + * are written whenever a new segment is used. It is ugly and potentially a + * bit wasteful, but robustness is more important. With this we can *always* + * erase all journal segments except the one containing the most recent commit. 
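+ *
+ * The per-commit write order implemented below: one JE_AREA entry per
+ * open area, the JE_OBJ_ALIAS buffers, then JE_ERASECOUNT, JE_ANCHOR
+ * and JE_DYNSB, and finally a JE_COMMIT entry whose payload is the
+ * array of offsets of everything written before it.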
+ */ +void logfs_write_anchor(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + int i, err; + + if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY)) + return; + super->s_flags &= ~LOGFS_SB_FLAG_DIRTY; + + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + mutex_lock(&super->s_journal_mutex); + + /* Do this first or suffer corruption */ + logfs_sync_segments(sb); + account_shadows(sb); + +again: + super->s_no_je = 0; + for_each_area(i) { + if (!super->s_area[i]->a_is_open) + continue; + super->s_sum_index = i; + err = logfs_write_je(sb, logfs_write_area); + if (err) + goto again; + } + err = logfs_write_obj_aliases(sb); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_erasecount); + if (err) + goto again; + err = logfs_write_je(sb, __logfs_write_anchor); + if (err) + goto again; + err = logfs_write_je(sb, logfs_write_dynsb); + if (err) + goto again; + /* + * Order is imperative. First we sync all writes, including the + * non-committed journal writes. Then we write the final commit and + * sync the current journal segment. + * There is a theoretical bug here. Syncing the journal segment will + * write a number of journal entries and the final commit. All these + * are written in a single operation. If the device layer writes the + * data back-to-front, the commit will precede the other journal + * entries, leaving a race window. + * Two fixes are possible. Preferred is to fix the device layer to + * ensure writes happen front-to-back. Alternatively we can insert + * another logfs_sync_area() super->s_devops->sync() combo before + * writing the commit. + */ + /* + * On another subject, super->s_devops->sync is usually not necessary. + * Unless called from sys_sync or friends, a barrier would suffice. 
+ */ + super->s_devops->sync(sb); + err = logfs_write_je(sb, logfs_write_commit); + if (err) + goto again; + log_journal("Write commit to %llx\n", + be64_to_cpu(super->s_je_array[super->s_no_je - 1])); + logfs_sync_area(area); + BUG_ON(area->a_used_bytes != area->a_written_bytes); + super->s_devops->sync(sb); + + mutex_unlock(&super->s_journal_mutex); + return; +} + +void do_logfs_journal_wl_pass(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_area *area = super->s_journal_area; + u32 segno, ec; + int i, err; + + log_journal("Journal requires wear-leveling.\n"); + /* Drop old segments */ + journal_for_each(i) + if (super->s_journal_seg[i]) { + logfs_set_segment_unreserved(sb, + super->s_journal_seg[i], + super->s_journal_ec[i]); + super->s_journal_seg[i] = 0; + super->s_journal_ec[i] = 0; + } + /* Get new segments */ + for (i = 0; i < super->s_no_journal_segs; i++) { + segno = get_best_cand(sb, &super->s_reserve_list, &ec); + super->s_journal_seg[i] = segno; + super->s_journal_ec[i] = ec; + logfs_set_segment_reserved(sb, segno); + } + /* Manually move journal_area */ + area->a_segno = super->s_journal_seg[0]; + area->a_is_open = 0; + area->a_used_bytes = 0; + /* Write journal */ + logfs_write_anchor(sb); + /* Write superblocks */ + err = logfs_write_sb(sb); + BUG_ON(err); +} + +static const struct logfs_area_ops journal_area_ops = { + .get_free_segment = journal_get_free_segment, + .get_erase_count = journal_get_erase_count, + .erase_segment = journal_erase_segment, +}; + +int logfs_init_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) + + MAX_JOURNAL_HEADER; + int ret = -ENOMEM; + + mutex_init(&super->s_journal_mutex); + btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); + + super->s_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_je) + return ret; + + super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); + if (!super->s_compressed_je) + return ret; + + super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); + if (IS_ERR(super->s_master_inode)) + return PTR_ERR(super->s_master_inode); + + ret = logfs_read_journal(sb); + if (ret) + return -EIO; + + reserve_sb_and_journal(sb); + logfs_calc_free(sb); + + super->s_journal_area->a_ops = &journal_area_ops; + return 0; +} + +void logfs_cleanup_journal(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); + destroy_meta_inode(super->s_master_inode); + super->s_master_inode = NULL; + + kfree(super->s_compressed_je); + kfree(super->s_je); +} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h new file mode 100644 index 00000000000..12977943137 --- /dev/null +++ b/fs/logfs/logfs.h @@ -0,0 +1,724 @@ +/* + * fs/logfs/logfs.h + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * Private header for logfs. 
+ */
+#ifndef FS_LOGFS_LOGFS_H
+#define FS_LOGFS_LOGFS_H
+
+#undef __CHECK_ENDIAN__
+#define __CHECK_ENDIAN__
+
+#include <linux/btree.h>
+#include <linux/crc32.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/mtd/mtd.h>
+#include "logfs_abi.h"
+
+#define LOGFS_DEBUG_SUPER	(0x0001)
+#define LOGFS_DEBUG_SEGMENT	(0x0002)
+#define LOGFS_DEBUG_JOURNAL	(0x0004)
+#define LOGFS_DEBUG_DIR		(0x0008)
+#define LOGFS_DEBUG_FILE	(0x0010)
+#define LOGFS_DEBUG_INODE	(0x0020)
+#define LOGFS_DEBUG_READWRITE	(0x0040)
+#define LOGFS_DEBUG_GC		(0x0080)
+#define LOGFS_DEBUG_GC_NOISY	(0x0100)
+#define LOGFS_DEBUG_ALIASES	(0x0200)
+#define LOGFS_DEBUG_BLOCKMOVE	(0x0400)
+#define LOGFS_DEBUG_ALL		(0xffffffff)
+
+/*
+ * To enable specific log messages, simply define LOGFS_DEBUG to match any
+ * or all of the above, e.g.:
+ *	#define LOGFS_DEBUG	(0x01)
+ */
+#ifndef LOGFS_DEBUG
+#define LOGFS_DEBUG		(0)
+#endif
+
+#define log_cond(cond, fmt, arg...) do {	\
+	if (cond)				\
+		printk(KERN_DEBUG fmt, ##arg);	\
+} while (0)
+
+#define log_super(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
+#define log_segment(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
+#define log_journal(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
+#define log_dir(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
+#define log_file(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
+#define log_inode(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
+#define log_readwrite(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
+#define log_gc(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
+#define log_gc_noisy(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
+#define log_aliases(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
+#define log_blockmove(fmt, arg...) \
+	log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
+
+#define PG_pre_locked		PG_owner_priv_1
+#define PagePreLocked(page)	test_bit(PG_pre_locked, &(page)->flags)
+#define SetPagePreLocked(page)	set_bit(PG_pre_locked, &(page)->flags)
+#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
+
+/* FIXME: This should really be somewhere in the 64bit area.
+ */
+#define LOGFS_LINK_MAX		(1<<30)
+
+/* Read-only filesystem */
+#define LOGFS_SB_FLAG_RO	0x0001
+#define LOGFS_SB_FLAG_DIRTY	0x0002
+#define LOGFS_SB_FLAG_OBJ_ALIAS	0x0004
+#define LOGFS_SB_FLAG_SHUTDOWN	0x0008
+
+/* Write Control Flags */
+#define WF_LOCK			0x01 /* take write lock */
+#define WF_WRITE		0x02 /* write block */
+#define WF_DELETE		0x04 /* delete old block */
+
+typedef u8 __bitwise level_t;
+typedef u8 __bitwise gc_level_t;
+
+#define LEVEL(level) ((__force level_t)(level))
+#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
+
+#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)),	\
+		(__force level_t)((__force u8)(level) - 1) )
+
+/**
+ * struct logfs_area - area management information
+ *
+ * @a_sb:			the superblock this area belongs to
+ * @a_is_open:			1 if the area is currently open, else 0
+ * @a_segno:			segment number of area
+ * @a_written_bytes:		number of bytes already written back
+ * @a_used_bytes:		number of used bytes
+ * @a_ops:			area operations (either journal or ostore)
+ * @a_erase_count:		erase count
+ * @a_level:			GC level
+ */
+struct logfs_area { /* a segment open for writing */
+	struct super_block *a_sb;
+	int	a_is_open;
+	u32	a_segno;
+	u32	a_written_bytes;
+	u32	a_used_bytes;
+	const struct logfs_area_ops *a_ops;
+	u32	a_erase_count;
+	gc_level_t a_level;
+};
+
+/**
+ * struct logfs_area_ops - area operations
+ *
+ * @get_free_segment:		fill area->ofs with the offset of a free segment
+ * @get_erase_count:		fill area->erase_count (needs area->ofs)
+ * @erase_segment:		erase and setup segment
+ */
+struct logfs_area_ops {
+	void	(*get_free_segment)(struct logfs_area *area);
+	void	(*get_erase_count)(struct logfs_area *area);
+	int	(*erase_segment)(struct logfs_area *area);
+};
+
+/**
+ * struct logfs_device_ops - device access operations
+ *
+ * @find_first_sb:		return the first superblock page and its offset
+ * @find_last_sb:		return the last superblock page and its offset
+ * @write_sb:			write back a superblock page
+ * @readpage:			read one page (mm page)
+ * @writeseg:			write one segment.  May be a partial segment
+ * @erase:			erase part of the device
+ * @sync:			wait for outstanding writes to finish
+ * @put_device:			release the underlying device
+ */
+struct logfs_device_ops {
+	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
+	struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
+	int (*write_sb)(struct super_block *sb, struct page *page);
+	int (*readpage)(void *_sb, struct page *page);
+	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
+	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
+			int ensure_write);
+	void (*sync)(struct super_block *sb);
+	void (*put_device)(struct super_block *sb);
+};
+
+/**
+ * struct candidate_list - list of similar candidates
+ */
+struct candidate_list {
+	struct rb_root rb_tree;
+	int count;
+	int maxcount;
+	int sort_by_ec;
+};
+
+/**
+ * struct gc_candidate - "candidate" segment to be garbage collected next
+ *
+ * @list:			list (either free or low)
+ * @segno:			segment number
+ * @valid:			number of valid bytes
+ * @erase_count:		erase count of segment
+ * @dist:			distance from tree root
+ *
+ * Candidates can be on two lists.  The free list contains electees rather
+ * than candidates - segments that no longer contain any valid data.  The
+ * low list contains candidates to be picked for GC.  It should be kept
+ * short.  It is not required to always pick a perfect candidate.  In the
+ * worst case GC will have to move more data than absolutely necessary.
+ */ +struct gc_candidate { + struct rb_node rb_node; + struct candidate_list *list; + u32 segno; + u32 valid; + u32 erase_count; + u8 dist; +}; + +/** + * struct logfs_journal_entry - temporary structure used during journal scan + * + * @used: + * @version: normalized version + * @len: length + * @offset: offset + */ +struct logfs_journal_entry { + int used; + s16 version; + u16 len; + u16 datalen; + u64 offset; +}; + +enum transaction_state { + CREATE_1 = 1, + CREATE_2, + UNLINK_1, + UNLINK_2, + CROSS_RENAME_1, + CROSS_RENAME_2, + TARGET_RENAME_1, + TARGET_RENAME_2, + TARGET_RENAME_3 +}; + +/** + * struct logfs_transaction - essential fields to support atomic dirops + * + * @ino: target inode + * @dir: inode of directory containing dentry + * @pos: pos of dentry in directory + */ +struct logfs_transaction { + enum transaction_state state; + u64 ino; + u64 dir; + u64 pos; +}; + +/** + * struct logfs_shadow - old block in the shadow of a not-yet-committed new one + * @old_ofs: offset of old block on medium + * @new_ofs: offset of new block on medium + * @ino: inode number + * @bix: block index + * @old_len: size of old block, including header + * @new_len: size of new block, including header + * @level: block level + */ +struct logfs_shadow { + u64 old_ofs; + u64 new_ofs; + u64 ino; + u64 bix; + int old_len; + int new_len; + gc_level_t gc_level; +}; + +/** + * struct shadow_tree + * @new: shadows where old_ofs==0, indexed by new_ofs + * @old: shadows where old_ofs!=0, indexed by old_ofs + */ +struct shadow_tree { + struct btree_head64 new; + struct btree_head64 old; +}; + +struct object_alias_item { + struct list_head list; + __be64 val; + int child_no; +}; + +/** + * struct logfs_block - contains any block state + * @type: indirect block or inode + * @full: number of fully populated children + * @partial: number of partially populated children + * + * Most blocks are directly represented by page cache pages. But when a block + * becomes dirty, is part of a transaction, contains aliases or is otherwise + * special, a struct logfs_block is allocated to track the additional state. + * Inodes are very similar to indirect blocks, so they can also get one of + * these structures added when appropriate. 
+ */
+#define BLOCK_INDIRECT	1	/* Indirect block */
+#define BLOCK_INODE	2	/* Inode */
+struct logfs_block_ops;
+struct logfs_block {
+	struct list_head alias_list;
+	struct list_head item_list;
+	struct super_block *sb;
+	u64 ino;
+	u64 bix;
+	level_t level;
+	struct page *page;
+	struct inode *inode;
+	struct logfs_transaction *ta;
+	unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
+	struct logfs_block_ops *ops;
+	int full;
+	int partial;
+	int reserved_bytes;
+};
+
+typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
+		level_t level, int child_no, __be64 val);
+struct logfs_block_ops {
+	void	(*write_block)(struct logfs_block *block);
+	gc_level_t	(*block_level)(struct logfs_block *block);
+	void	(*free_block)(struct super_block *sb, struct logfs_block *block);
+	int	(*write_alias)(struct super_block *sb,
+			struct logfs_block *block,
+			write_alias_t *write_one_alias);
+};
+
+struct logfs_super {
+	struct mtd_info *s_mtd;			/* underlying device */
+	struct block_device *s_bdev;		/* underlying device */
+	const struct logfs_device_ops *s_devops;/* device access */
+	struct inode	*s_master_inode;	/* inode file */
+	struct inode	*s_segfile_inode;	/* segment file */
+	struct inode	*s_mapping_inode;	/* device mapping */
+	atomic_t	s_pending_writes;	/* outstanding bios */
+	long		s_flags;
+	mempool_t	*s_btree_pool;		/* for btree nodes */
+	mempool_t	*s_alias_pool;		/* aliases in segment.c */
+	u64		s_feature_incompat;
+	u64		s_feature_ro_compat;
+	u64		s_feature_compat;
+	u64		s_feature_flags;
+	u64		s_sb_ofs[2];
+	struct page	*s_erase_page;		/* for dev_bdev.c */
+	/* alias.c fields */
+	struct btree_head32 s_segment_alias;	/* remapped segments */
+	int		s_no_object_aliases;
+	struct list_head s_object_alias;	/* remapped objects */
+	struct btree_head128 s_object_alias_tree; /* remapped objects */
+	struct mutex	s_object_alias_mutex;
+	/* dir.c fields */
+	struct mutex	s_dirop_mutex;		/* for creat/unlink/rename */
+	u64		s_victim_ino;		/* used for atomic dir-ops */
+	u64		s_rename_dir;		/* source directory ino */
+	u64		s_rename_pos;		/* position of source dd */
+	/* gc.c fields */
+	long		s_segsize;		/* size of a segment */
+	int		s_segshift;		/* log2 of segment size */
+	long		s_segmask;		/* (1 << s_segshift) - 1 */
+	long		s_no_segs;		/* segments on device */
+	long		s_no_journal_segs;	/* segments used for journal */
+	long		s_no_blocks;		/* blocks per segment */
+	long		s_writesize;		/* minimum write size */
+	int		s_writeshift;		/* log2 of write size */
+	u64		s_size;			/* filesystem size */
+	struct logfs_area *s_area[LOGFS_NO_AREAS];	/* open segment array */
+	u64		s_gec;			/* global erase count */
+	u64		s_wl_gec_ostore;	/* time of last wl event */
+	u64		s_wl_gec_journal;	/* time of last wl event */
+	u64		s_sweeper;		/* current sweeper pos */
+	u8		s_ifile_levels;		/* max level of ifile */
+	u8		s_iblock_levels;	/* max level of regular files */
+	u8		s_data_levels;		/* # of segments to leaf block*/
+	u8		s_total_levels;		/* sum of above three */
+	struct btree_head32 s_cand_tree;	/* all candidates */
+	struct candidate_list s_free_list;	/* 100% free segments */
+	struct candidate_list s_reserve_list;	/* Bad segment reserve */
+	struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
+	struct candidate_list s_ec_list;	/* wear level candidates */
+	struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc.
*/ + /* inode.c fields */ + u64 s_last_ino; /* highest ino used */ + long s_inos_till_wrap; + u32 s_generation; /* i_generation for new files */ + struct list_head s_freeing_list; /* inodes being freed */ + /* journal.c fields */ + struct mutex s_journal_mutex; + void *s_je; /* journal entry to compress */ + void *s_compressed_je; /* block to write to journal */ + u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ + u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ + u64 s_last_version; + struct logfs_area *s_journal_area; /* open journal segment */ + __be64 s_je_array[64]; + int s_no_je; + + int s_sum_index; /* for the 12 summaries */ + struct shadow_tree s_shadow_tree; + int s_je_fill; /* index of current je */ + /* readwrite.c fields */ + struct mutex s_write_mutex; + int s_lock_count; + mempool_t *s_block_pool; /* struct logfs_block pool */ + mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ + /* + * Space accounting: + * - s_used_bytes specifies space used to store valid data objects. + * - s_dirty_used_bytes is space used to store non-committed data + * objects. Those objects have already been written themselves, + * but they don't become valid until all indirect blocks up to the + * journal have been written as well. + * - s_dirty_free_bytes is space used to store the old copy of a + * replaced object, as long as the replacement is non-committed. + * In other words, it is the amount of space freed when all dirty + * blocks are written back. + * - s_free_bytes is the amount of free space available for any + * purpose. + * - s_root_reserve is the amount of free space available only to + * the root user. Non-privileged users can no longer write once + * this watermark has been reached. + * - s_speed_reserve is space which remains unused to speed up + * garbage collection performance. + * - s_dirty_pages is the space reserved for currently dirty pages. + * It is a pessimistic estimate, so some/most will get freed on + * page writeback. + * + * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size + */ + u64 s_free_bytes; + u64 s_used_bytes; + u64 s_dirty_free_bytes; + u64 s_dirty_used_bytes; + u64 s_root_reserve; + u64 s_speed_reserve; + u64 s_dirty_pages; + /* Bad block handling: + * - s_bad_seg_reserve is a number of segments usually kept + * free. When encountering bad blocks, the affected segment's data + * is _temporarily_ moved to a reserved segment. + * - s_bad_segments is the number of known bad segments. 
+ */ + u32 s_bad_seg_reserve; + u32 s_bad_segments; +}; + +/** + * struct logfs_inode - in-memory inode + * + * @vfs_inode: struct inode + * @li_data: data pointers + * @li_used_bytes: number of used bytes + * @li_freeing_list: used to track inodes currently being freed + * @li_flags: inode flags + * @li_refcount: number of internal (GC-induced) references + */ +struct logfs_inode { + struct inode vfs_inode; + u64 li_data[LOGFS_EMBEDDED_FIELDS]; + u64 li_used_bytes; + struct list_head li_freeing_list; + struct logfs_block *li_block; + u32 li_flags; + u8 li_height; + int li_refcount; +}; + +#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) +#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) +#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) + +/* compr.c */ +int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); +int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); +int __init logfs_compr_init(void); +void logfs_compr_exit(void); + +/* dev_bdev.c */ +#ifdef CONFIG_BLOCK +int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, + const char *devname, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dev_mtd.c */ +#ifdef CONFIG_MTD +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt); +#else +static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + return -ENODEV; +} +#endif + +/* dir.c */ +extern const struct inode_operations logfs_symlink_iops; +extern const struct inode_operations logfs_dir_iops; +extern const struct file_operations logfs_dir_fops; +int logfs_replay_journal(struct super_block *sb); + +/* file.c */ +extern const struct inode_operations logfs_reg_iops; +extern const struct file_operations logfs_reg_fops; +extern const struct address_space_operations logfs_reg_aops; +int logfs_readpage(struct file *file, struct page *page); +int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg); +int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); + +/* gc.c */ +u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); +void logfs_gc_pass(struct super_block *sb); +int logfs_check_areas(struct super_block *sb); +int logfs_init_gc(struct super_block *sb); +void logfs_cleanup_gc(struct super_block *sb); + +/* inode.c */ +extern const struct super_operations logfs_super_operations; +struct inode *logfs_iget(struct super_block *sb, ino_t ino); +struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); +void logfs_safe_iput(struct inode *inode, int cookie); +struct inode *logfs_new_inode(struct inode *dir, int mode); +struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); +struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); +int logfs_init_inode_cache(void); +void logfs_destroy_inode_cache(void); +void destroy_meta_inode(struct inode *inode); +void logfs_set_blocks(struct inode *inode, u64 no); +/* these logically belong into inode.c but actually reside in readwrite.c */ +int logfs_read_inode(struct inode *inode); +int __logfs_write_inode(struct inode *inode, long flags); +void logfs_delete_inode(struct inode *inode); +void logfs_clear_inode(struct inode *inode); + +/* journal.c */ +void 
logfs_write_anchor(struct super_block *sb); +int logfs_init_journal(struct super_block *sb); +void logfs_cleanup_journal(struct super_block *sb); +int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, + level_t level, int child_no, __be64 val); +void do_logfs_journal_wl_pass(struct super_block *sb); + +/* readwrite.c */ +pgoff_t logfs_pack_index(u64 bix, level_t level); +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree); +int logfs_readpage_nolock(struct page *page); +int logfs_write_buf(struct inode *inode, struct page *page, long flags); +int logfs_delete(struct inode *inode, pgoff_t index, + struct shadow_tree *shadow_tree); +int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, + gc_level_t gc_level, long flags); +int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, + gc_level_t gc_level); +int logfs_truncate(struct inode *inode, u64 size); +u64 logfs_seek_hole(struct inode *inode, u64 bix); +u64 logfs_seek_data(struct inode *inode, u64 bix); +int logfs_open_segfile(struct super_block *sb); +int logfs_init_rw(struct super_block *sb); +void logfs_cleanup_rw(struct super_block *sb); +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); +void logfs_write_block(struct logfs_block *block, long flags); +int logfs_write_obj_aliases_pagecache(struct super_block *sb); +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se); +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); +void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, + gc_level_t gc_level); +void logfs_set_segment_reserved(struct super_block *sb, u32 segno); +void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); +struct logfs_block *__alloc_block(struct super_block *sb, + u64 ino, u64 bix, level_t level); +void __free_block(struct super_block *sb, struct logfs_block *block); +void btree_write_block(struct logfs_block *block); +void initialize_block_counters(struct page *page, struct logfs_block *block, + __be64 *array, int page_is_empty); +int logfs_exist_block(struct inode *inode, u64 bix); +int get_page_reserve(struct inode *inode, struct page *page); +extern struct logfs_block_ops indirect_block_ops; + +/* segment.c */ +int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); +int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, + level_t level); +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow); +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count); +void move_page_to_btree(struct page *page); +int logfs_init_mapping(struct super_block *sb); +void logfs_sync_area(struct logfs_area *area); +void logfs_sync_segments(struct super_block *sb); + +/* area handling */ +int logfs_init_areas(struct super_block *sb); +void logfs_cleanup_areas(struct super_block *sb); +int logfs_open_area(struct logfs_area *area, size_t bytes); +void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, + int use_filler); + +static inline void 
logfs_buf_write(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 0); +} + +static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs, + void *buf, size_t len) +{ + __logfs_buf_write(area, ofs, buf, len, 1); +} + +/* super.c */ +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); +void emergency_read_end(struct page *page); +void logfs_crash_dump(struct super_block *sb); +void *memchr_inv(const void *s, int c, size_t n); +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt); +int logfs_check_ds(struct logfs_disk_super *ds); +int logfs_write_sb(struct super_block *sb); + +static inline struct logfs_super *logfs_super(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct logfs_inode *logfs_inode(struct inode *inode) +{ + return container_of(inode, struct logfs_inode, vfs_inode); +} + +static inline void logfs_set_ro(struct super_block *sb) +{ + logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; +} + +#define LOGFS_BUG(sb) do { \ + struct super_block *__sb = sb; \ + logfs_crash_dump(__sb); \ + logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ + BUG(); \ +} while (0) + +#define LOGFS_BUG_ON(condition, sb) \ + do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) + +static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) +{ + return cpu_to_be32(crc32(~0, data+skip, len-skip)); +} + +static inline u8 logfs_type(struct inode *inode) +{ + return (inode->i_mode >> 12) & 15; +} + +static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) +{ + return pos >> sb->s_blocksize_bits; +} + +static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) +{ + return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; +} + +static inline u32 seg_no(struct super_block *sb, u64 ofs) +{ + return ofs >> logfs_super(sb)->s_segshift; +} + +static inline u32 seg_ofs(struct super_block *sb, u64 ofs) +{ + return ofs & logfs_super(sb)->s_segmask; +} + +static inline u64 seg_align(struct super_block *sb, u64 ofs) +{ + return ofs & ~logfs_super(sb)->s_segmask; +} + +static inline struct logfs_block *logfs_block(struct page *page) +{ + return (void *)page->private; +} + +static inline level_t shrink_level(gc_level_t __level) +{ + u8 level = (__force u8)__level; + + if (level >= LOGFS_MAX_LEVELS) + level -= LOGFS_MAX_LEVELS; + return (__force level_t)level; +} + +static inline gc_level_t expand_level(u64 ino, level_t __level) +{ + u8 level = (__force u8)__level; + + if (ino == LOGFS_INO_MASTER) { + /* ifile has seperate areas */ + level += LOGFS_MAX_LEVELS; + } + return (__force gc_level_t)level; +} + +static inline int logfs_block_shift(struct super_block *sb, level_t level) +{ + level = shrink_level((__force gc_level_t)level); + return (__force int)level * (sb->s_blocksize_bits - 3); +} + +static inline u64 logfs_block_mask(struct super_block *sb, level_t level) +{ + return ~0ull << logfs_block_shift(sb, level); +} + +static inline struct logfs_area *get_area(struct super_block *sb, + gc_level_t gc_level) +{ + return logfs_super(sb)->s_area[(__force u8)gc_level]; +} + +#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h new file mode 100644 index 00000000000..f674725663f --- /dev/null +++ b/fs/logfs/logfs_abi.h @@ -0,0 +1,629 @@ +/* + * fs/logfs/logfs_abi.h + * + * As should 
be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ * Public header for logfs.
+ */
+#ifndef FS_LOGFS_LOGFS_ABI_H
+#define FS_LOGFS_LOGFS_ABI_H
+
+/* For out-of-kernel compiles */
+#ifndef BUILD_BUG_ON
+#define BUILD_BUG_ON(condition) /**/
+#endif
+
+#define SIZE_CHECK(type, size)					\
+static inline void check_##type(void)				\
+{								\
+	BUILD_BUG_ON(sizeof(struct type) != (size));		\
+}
+
+/*
+ * Throughout the logfs code, we're constantly dealing with blocks at
+ * various positions or offsets.  To remove confusion, we strictly
+ * distinguish between a "position" - the logical position within a
+ * file - and an "offset" - the physical location within the device.
+ *
+ * Any usage of the term offset for a logical location or position for
+ * a physical one is a bug and should get fixed.
+ */
+
+/*
+ * Blocks are allocated in one of several segments depending on their
+ * level.  The following levels are used:
+ *  0	- regular data block
+ *  1	- i1 indirect blocks
+ *  2	- i2 indirect blocks
+ *  3	- i3 indirect blocks
+ *  4	- i4 indirect blocks
+ *  5	- i5 indirect blocks
+ *  6	- ifile data blocks
+ *  7	- ifile i1 indirect blocks
+ *  8	- ifile i2 indirect blocks
+ *  9	- ifile i3 indirect blocks
+ * 10	- ifile i4 indirect blocks
+ * 11	- ifile i5 indirect blocks
+ * Potential levels to be used in the future:
+ * 12	- gc recycled blocks, long-lived data
+ * 13	- replacement blocks, short-lived data
+ *
+ * Levels 1-11 are necessary for robust gc operations and help separate
+ * short-lived metadata from longer-lived file data.  In the future,
+ * file data should get separated into several segments based on simple
+ * heuristics.  Old data recycled during gc operation is expected to be
+ * long-lived.  New data is of uncertain life expectancy.  New data
+ * used to replace older blocks in existing files is expected to be
+ * short-lived.
+ */
+
+
+/* Magic numbers.  64bit for superblock, 32bit for statfs f_type */
+#define LOGFS_MAGIC		0x7a3a8e5cb9d5bf67ull
+#define LOGFS_MAGIC_U32		0xc97e8168u
+
+/*
+ * Various blocksize related macros.  Blocksize is currently fixed at 4KiB.
+ * Sooner or later that should become configurable and the macros replaced
+ * by something superblock-dependent.  Pointers in indirect blocks are and
+ * will remain 64bit.
+ *
+ * LOGFS_BLOCKSIZE	- self-explaining
+ * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
+ * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
+ */
+#define LOGFS_BLOCKSIZE		(4096ull)
+#define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(u64))
+#define LOGFS_BLOCK_BITS	(9)
+
+/*
+ * Number of blocks at various levels of indirection.  There are 16 direct
+ * block pointers plus a single indirect pointer.
+ */
+#define I0_BLOCKS		(16)
+#define I1_BLOCKS		LOGFS_BLOCK_FACTOR
+#define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
+#define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
+#define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
+#define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)
+
+#define INDIRECT_INDEX		I0_BLOCKS
+#define LOGFS_EMBEDDED_FIELDS	(I0_BLOCKS + 1)
+
+/*
+ * Sizes at which files require another level of indirection.  Files smaller
+ * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
+ * similar to ext2 fast symlinks.
+ *
+ * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
+ * direct pointers, else through the 1x indirect pointer and so forth.
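+ * As a worked example with the 4KiB blocksize above:
+ * LOGFS_EMBEDDED_SIZE below is 17 * 8 = 136 bytes, LOGFS_I0_SIZE is
+ * 16 * 4KiB = 64KiB and LOGFS_I1_SIZE is 512 * 4KiB = 2MiB.  A
+ * 136-byte symlink thus fits entirely inside its inode, while a
+ * 100KiB file already needs the 1x indirect pointer.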
+ */
+#define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(u64))
+#define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
+#define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)
+
+/*
+ * Each indirect block pointer must have this flag set if all block pointers
+ * behind it are set, i.e. there is no hole hidden in the shadow of this
+ * indirect block pointer.
+ */
+#define LOGFS_FULLY_POPULATED	(1ULL << 63)
+#define pure_ofs(ofs)		(ofs & ~LOGFS_FULLY_POPULATED)
+
+/*
+ * LogFS needs to separate data into levels.  Each level is defined as the
+ * maximal possible distance from the master inode (inode of the inode file).
+ * Data blocks reside on level 0, 1x indirect block on level 1, etc.
+ * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
+ * This effort is necessary to guarantee garbage collection to always make
+ * progress.
+ *
+ * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
+ * LOGFS_MAX_LEVELS is one more for the actual data level of a file.  It is
+ * the maximal number of levels for one file.
+ * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
+ * effectively stacked on top of each other.
+ */
+#define LOGFS_MAX_INDIRECT	(5)
+#define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
+#define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)
+
+/* Maximum size of filenames */
+#define LOGFS_MAX_NAMELEN	(255)
+
+/* Number of segments in the primary journal. */
+#define LOGFS_JOURNAL_SEGS	(16)
+
+/* Maximum number of free/erased/etc. segments in journal entries */
+#define MAX_CACHED_SEGS		(64)
+
+
+/*
+ * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
+ * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
+ * its header,
+ * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
+ * its segment header and the padded space at the end when no further objects
+ * fit.
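+ * With the values below this works out to 0x18 + (0x1c + 4096) - 1
+ * = 4147 bytes reserved per segment.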
+ */
+#define LOGFS_OBJECT_HEADERSIZE	(0x1c)
+#define LOGFS_SEGMENT_HEADERSIZE	(0x18)
+#define LOGFS_MAX_OBJECTSIZE	(LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
+#define LOGFS_SEGMENT_RESERVE	\
+	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
+
+/*
+ * Segment types:
+ * SEG_SUPER	- segment containing a superblock
+ * SEG_JOURNAL	- segment used for the journal
+ * SEG_OSTORE	- segment used for the object store
+ */
+enum {
+	SEG_SUPER	= 0x01,
+	SEG_JOURNAL	= 0x02,
+	SEG_OSTORE	= 0x03,
+};
+
+/**
+ * struct logfs_segment_header - per-segment header in the ostore
+ *
+ * @crc:			crc32 of header (there is no data)
+ * @pad:			unused, must be 0
+ * @type:			segment type, see above
+ * @level:			GC level for all objects in this segment
+ * @segno:			segment number
+ * @ec:				erase count for this segment
+ * @gec:			global erase count at time of writing
+ */
+struct logfs_segment_header {
+	__be32	crc;
+	__be16	pad;
+	__u8	type;
+	__u8	level;
+	__be32	segno;
+	__be32	ec;
+	__be64	gec;
+};
+
+SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
+
+#define LOGFS_FEATURES_INCOMPAT		(0ull)
+#define LOGFS_FEATURES_RO_COMPAT	(0ull)
+#define LOGFS_FEATURES_COMPAT		(0ull)
+
+/**
+ * struct logfs_disk_super - on-medium superblock
+ *
+ * @ds_sh:			segment header for the superblock segment
+ * @ds_magic:			magic number, must equal LOGFS_MAGIC
+ * @ds_crc:			crc32 of structure starting with the next field
+ * @ds_ifile_levels:		maximum number of levels for ifile
+ * @ds_iblock_levels:		maximum number of levels for regular files
+ * @ds_data_levels:		number of separate levels for data
+ * @ds_segment_shift:		log2 of segment size
+ * @ds_block_shift:		log2 of block size
+ * @ds_write_shift:		log2 of write size
+ * @pad0:			reserved, must be 0
+ * @ds_filesystem_size:		size of the filesystem in bytes
+ * @ds_segment_size:		size of a single segment in bytes
+ * @ds_bad_seg_reserve:		number of segments reserved to handle bad blocks
+ * @ds_feature_incompat:	incompatible filesystem features
+ * @ds_feature_ro_compat:	read-only compatible filesystem features
+ * @ds_feature_compat:		compatible filesystem features
+ * @ds_feature_flags:		feature flags
+ * @ds_root_reserve:		bytes reserved for the superuser
+ * @ds_speed_reserve:		bytes reserved to speed up GC
+ * @ds_journal_seg:		segments used by primary journal
+ * @ds_super_ofs:		device offsets of the two superblocks
+ * @pad3:			reserved, must be 0
+ *
+ * Contains only read-only fields.  Read-write fields like the amount of used
+ * space are tracked in the dynamic superblock, which is stored in the journal.
+ */ +struct logfs_disk_super { + struct logfs_segment_header ds_sh; + __be64 ds_magic; + + __be32 ds_crc; + __u8 ds_ifile_levels; + __u8 ds_iblock_levels; + __u8 ds_data_levels; + __u8 ds_segment_shift; + __u8 ds_block_shift; + __u8 ds_write_shift; + __u8 pad0[6]; + + __be64 ds_filesystem_size; + __be32 ds_segment_size; + __be32 ds_bad_seg_reserve; + + __be64 ds_feature_incompat; + __be64 ds_feature_ro_compat; + + __be64 ds_feature_compat; + __be64 ds_feature_flags; + + __be64 ds_root_reserve; + __be64 ds_speed_reserve; + + __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; + + __be64 ds_super_ofs[2]; + __be64 pad3[8]; +}; + +SIZE_CHECK(logfs_disk_super, 256); + +/* + * Object types: + * OBJ_BLOCK - Data or indirect block + * OBJ_INODE - Inode + * OBJ_DENTRY - Dentry + */ +enum { + OBJ_BLOCK = 0x04, + OBJ_INODE = 0x05, + OBJ_DENTRY = 0x06, +}; + +/** + * struct logfs_object_header - per-object header in the ostore + * + * @crc: crc32 of header, excluding data_crc + * @len: length of data + * @type: object type, see above + * @compr: compression type + * @ino: inode number + * @bix: block index + * @data_crc: crc32 of payload + */ +struct logfs_object_header { + __be32 crc; + __be16 len; + __u8 type; + __u8 compr; + __be64 ino; + __be64 bix; + __be32 data_crc; +} __attribute__((packed)); + +SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); + +/* + * Reserved inode numbers: + * LOGFS_INO_MASTER - master inode (for inode file) + * LOGFS_INO_ROOT - root directory + * LOGFS_INO_SEGFILE - per-segment used bytes and erase count + */ +enum { + LOGFS_INO_MAPPING = 0x00, + LOGFS_INO_MASTER = 0x01, + LOGFS_INO_ROOT = 0x02, + LOGFS_INO_SEGFILE = 0x03, + LOGFS_RESERVED_INOS = 0x10, +}; + +/* + * Inode flags. High bits should never be written to the medium. They are + * reserved for in-memory usage. + * Low bits should either remain in sync with the corresponding FS_*_FL or + * reuse slots that obviously don't make sense for logfs. 
+ *
+ * LOGFS_IF_DIRTY	Inode must be written back
+ * LOGFS_IF_ZOMBIE	Inode has been deleted
+ * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
+ */
+#define LOGFS_IF_COMPRESSED	0x00000004 /* == FS_COMPR_FL */
+#define LOGFS_IF_DIRTY		0x20000000
+#define LOGFS_IF_ZOMBIE		0x40000000
+#define LOGFS_IF_STILLBORN	0x80000000
+
+/* Flags available to chattr */
+#define LOGFS_FL_USER_VISIBLE	(LOGFS_IF_COMPRESSED)
+#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
+/* Flags inherited from parent directory on file/directory creation */
+#define LOGFS_FL_INHERITED	(LOGFS_IF_COMPRESSED)
+
+/**
+ * struct logfs_disk_inode - on-medium inode
+ *
+ * @di_mode:			file mode
+ * @di_height:			height of the indirect block tree
+ * @di_pad:			reserved, must be 0
+ * @di_flags:			inode flags, see above
+ * @di_uid:			user id
+ * @di_gid:			group id
+ * @di_ctime:			change time
+ * @di_mtime:			modify time
+ * @di_atime:			access time
+ * @di_refcount:		reference count (aka nlink or link count)
+ * @di_generation:		inode generation, for nfs
+ * @di_used_bytes:		number of bytes used
+ * @di_size:			file size
+ * @di_data:			data pointers
+ */
+struct logfs_disk_inode {
+	__be16	di_mode;
+	__u8	di_height;
+	__u8	di_pad;
+	__be32	di_flags;
+	__be32	di_uid;
+	__be32	di_gid;
+
+	__be64	di_ctime;
+	__be64	di_mtime;
+
+	__be64	di_atime;
+	__be32	di_refcount;
+	__be32	di_generation;
+
+	__be64	di_used_bytes;
+	__be64	di_size;
+
+	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_disk_inode, 200);
+
+#define INODE_POINTER_OFS \
+	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
+#define INODE_USED_OFS \
+	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
+#define INODE_SIZE_OFS \
+	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
+#define INODE_HEIGHT_OFS	(0)
+
+/**
+ * struct logfs_disk_dentry - on-medium dentry structure
+ *
+ * @ino:			inode number
+ * @namelen:			length of file name
+ * @type:			file type, identical to bits 12..15 of mode
+ * @name:			file name
+ */
+/* FIXME: add 6 bytes of padding to remove the __packed */
+struct logfs_disk_dentry {
+	__be64	ino;
+	__be16	namelen;
+	__u8	type;
+	__u8	name[LOGFS_MAX_NAMELEN];
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_disk_dentry, 266);
+
+#define RESERVED		0xffffffff
+#define BADSEG			0xffffffff
+/**
+ * struct logfs_segment_entry - segment file entry
+ *
+ * @ec_level:			erase count and level
+ * @valid:			number of valid bytes
+ *
+ * Segment file contains one entry for every segment.  ec_level contains the
+ * erasecount in the upper 28 bits and the level in the lower 4 bits.  An
+ * ec_level of BADSEG (-1) identifies bad segments.  valid contains the number
+ * of valid bytes or RESERVED (-1 again) if the segment is used for either the
+ * superblock or the journal, or when the segment is bad.
+ */
+struct logfs_segment_entry {
+	__be32	ec_level;
+	__be32	valid;
+};
+
+SIZE_CHECK(logfs_segment_entry, 8);
+
+/**
+ * struct logfs_journal_header - header for journal entries (JEs)
+ *
+ * @h_crc:			crc32 of journal entry
+ * @h_len:			length of compressed journal entry,
+ *				not including header
+ * @h_datalen:			length of uncompressed data
+ * @h_type:			JE type
+ * @h_compr:			compression type
+ * @h_pad:			reserved
+ */
+struct logfs_journal_header {
+	__be32	h_crc;
+	__be16	h_len;
+	__be16	h_datalen;
+	__be16	h_type;
+	__u8	h_compr;
+	__u8	h_pad[5];
+};
+
+SIZE_CHECK(logfs_journal_header, 16);
+
+/*
+ * Life expectancy of data.
+ * VIM_DEFAULT	- default vim
+ * VIM_SEGFILE	- for segment file only - very short-living
+ * VIM_GC	- GC'd data - likely long-living (reserved, not defined yet)
+ */
+enum logfs_vim {
+	VIM_DEFAULT	= 0,
+	VIM_SEGFILE	= 1,
+};
+
+/**
+ * struct logfs_je_area - wbuf header
+ *
+ * @segno:			segment number of area
+ * @used_bytes:			number of bytes already used
+ * @gc_level:			GC level
+ * @vim:			life expectancy of data
+ *
+ * "Areas" are segments currently being used for writing.  There is at least
+ * one area per GC level.  Several may be used to separate long-living from
+ * short-living data.  If an area with unknown vim is encountered, it can
+ * simply be closed.
+ * The write buffer immediately follows this header.
+ */
+struct logfs_je_area {
+	__be32	segno;
+	__be32	used_bytes;
+	__u8	gc_level;
+	__u8	vim;
+} __attribute__((packed));
+
+SIZE_CHECK(logfs_je_area, 10);
+
+#define MAX_JOURNAL_HEADER \
+	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
+
+/**
+ * struct logfs_je_dynsb - dynamic superblock
+ *
+ * @ds_gec:			global erase count
+ * @ds_sweeper:			current position of GC "sweeper"
+ * @ds_rename_dir:		source directory ino (see dir.c documentation)
+ * @ds_rename_pos:		position of source dd (see dir.c documentation)
+ * @ds_victim_ino:		victims of incomplete dir operation (see dir.c)
+ * @ds_victim_parent:		parent inode of victim (see dir.c)
+ * @ds_used_bytes:		number of used bytes
+ */
+struct logfs_je_dynsb {
+	__be64	ds_gec;
+	__be64	ds_sweeper;
+
+	__be64	ds_rename_dir;
+	__be64	ds_rename_pos;
+
+	__be64	ds_victim_ino;
+	__be64	ds_victim_parent; /* XXX */
+
+	__be64	ds_used_bytes;
+	__be32	ds_generation;
+	__be32	pad;
+};
+
+SIZE_CHECK(logfs_je_dynsb, 64);
+
+/**
+ * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
+ *
+ * @da_size:			size of inode file
+ * @da_last_ino:		last created inode
+ * @da_used_bytes:		number of bytes used
+ * @da_height:			height of the inode file's block tree
+ * @da_data:			data pointers
+ */
+struct logfs_je_anchor {
+	__be64	da_size;
+	__be64	da_last_ino;
+
+	__be64	da_used_bytes;
+	u8	da_height;
+	u8	pad[7];
+
+	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
+};
+
+SIZE_CHECK(logfs_je_anchor, 168);
+
+/**
+ * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
+ *
+ * @so_segment:			segments used for 2nd journal
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_spillout {
+	__be64	so_segment[0];
+};
+
+SIZE_CHECK(logfs_je_spillout, 0);
+
+/**
+ * struct logfs_je_journal_ec - erase counts for all journal segments
+ *
+ * @ec:				erase count
+ *
+ * Length of the array is given by h_len field in the header.
+ */
+struct logfs_je_journal_ec {
+	__be32	ec[0];
+};
+
+SIZE_CHECK(logfs_je_journal_ec, 0);
+
+/**
+ * struct logfs_je_free_segments - list of free segments with erase count
+ */
+struct logfs_je_free_segments {
+	__be32	segno;
+	__be32	ec;
+};
+
+SIZE_CHECK(logfs_je_free_segments, 8);
+
+/**
+ * struct logfs_seg_alias - list of segment aliases
+ */
+struct logfs_seg_alias {
+	__be32	old_segno;
+	__be32	new_segno;
+};
+
+SIZE_CHECK(logfs_seg_alias, 8);
+
+/**
+ * struct logfs_obj_alias - list of object aliases
+ */
+struct logfs_obj_alias {
+	__be64	ino;
+	__be64	bix;
+	__be64	val;
+	u8	level;
+	u8	pad[5];
+	__be16	child_no;
+};
+
+SIZE_CHECK(logfs_obj_alias, 32);
+
+/**
+ * Compression types.
+ *
+ * COMPR_NONE	- uncompressed
+ * COMPR_ZLIB	- compressed with zlib
+ */
+enum {
+	COMPR_NONE	= 0,
+	COMPR_ZLIB	= 1,
+};
+
+/*
+ * Journal entries come in groups of 16.
First group contains unique
+ * entries, next groups contain one entry per level.
+ *
+ * JE_FIRST	- smallest possible journal entry number
+ *
+ * JEG_BASE	- base group, containing unique entries
+ * JE_COMMIT	- commit entry, validates all previous entries
+ * JE_DYNSB	- dynamic superblock, anything that ought to be in the
+ *		  superblock but cannot because it is read-write data
+ * JE_ANCHOR	- anchor aka master inode aka inode file's inode
+ * JE_ERASECOUNT - erase counts for all journal segments
+ * JE_SPILLOUT	- unused
+ * JE_OBJ_ALIAS	- object aliases
+ * JE_AREA	- area description
+ *
+ * JE_LAST	- largest possible journal entry number
+ */
+enum {
+	JE_FIRST	= 0x01,
+
+	JEG_BASE	= 0x00,
+	JE_COMMIT	= 0x02,
+	JE_DYNSB	= 0x03,
+	JE_ANCHOR	= 0x04,
+	JE_ERASECOUNT	= 0x05,
+	JE_SPILLOUT	= 0x06,
+	JE_OBJ_ALIAS	= 0x0d,
+	JE_AREA		= 0x0e,
+
+	JE_LAST		= 0x0e,
+};
+
+#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 00000000000..7a23b3e7c0a
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2246 @@
+/*
+ * fs/logfs/readwrite.c
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
+ *
+ *
+ * Actually contains eight sets of very similar functions:
+ *	read		read blocks from a file
+ *	seek_hole	find next hole
+ *	seek_data	find next data block
+ *	valid		check whether a block still belongs to a file
+ *	write		write blocks to a file
+ *	delete		delete a block (for directories and ifile)
+ *	rewrite		move existing blocks of a file to a new location (gc helper)
+ *	truncate	truncate a file
+ */
+#include "logfs.h"
+#include <linux/sched.h>
+
+static u64 adjust_bix(u64 bix, level_t level)
+{
+	switch (level) {
+	case 0:
+		return bix;
+	case LEVEL(1):
+		return max_t(u64, bix, I0_BLOCKS);
+	case LEVEL(2):
+		return max_t(u64, bix, I1_BLOCKS);
+	case LEVEL(3):
+		return max_t(u64, bix, I2_BLOCKS);
+	case LEVEL(4):
+		return max_t(u64, bix, I3_BLOCKS);
+	case LEVEL(5):
+		return max_t(u64, bix, I4_BLOCKS);
+	default:
+		WARN_ON(1);
+		return bix;
+	}
+}
+
+static inline u64 maxbix(u8 height)
+{
+	return 1ULL << (LOGFS_BLOCK_BITS * height);
+}
+
+/**
+ * The inode address space is cut in two halves.  Lower half belongs to data
+ * pages, upper half to indirect blocks.  If the high bit (INDIRECT_BIT) is
+ * set, the actual block index (bix) and level can be derived from the page
+ * index.
+ *
+ * The lowest three bits of the block index are set to 0 after packing and
+ * unpacking.  Since the lowest n bits (9 for 4KiB blocksize) are ignored
+ * anyway this is harmless.
+ */ +#define ARCH_SHIFT (BITS_PER_LONG - 32) +#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) +#define LEVEL_SHIFT (28 + ARCH_SHIFT) +static inline pgoff_t first_indirect_block(void) +{ + return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); +} + +pgoff_t logfs_pack_index(u64 bix, level_t level) +{ + pgoff_t index; + + BUG_ON(bix >= INDIRECT_BIT); + if (level == 0) + return bix; + + index = INDIRECT_BIT; + index |= (__force long)level << LEVEL_SHIFT; + index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); + return index; +} + +void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) +{ + u8 __level; + + if (!(index & INDIRECT_BIT)) { + *bix = index; + *level = 0; + return; + } + + __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; + *level = LEVEL(__level); + *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; + *bix = adjust_bix(*bix, *level); + return; +} +#undef ARCH_SHIFT +#undef INDIRECT_BIT +#undef LEVEL_SHIFT + +/* + * Time is stored as nanoseconds since the epoch. + */ +static struct timespec be64_to_timespec(__be64 betime) +{ + return ns_to_timespec(be64_to_cpu(betime)); +} + +static __be64 timespec_to_be64(struct timespec tsp) +{ + return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); +} + +static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + inode->i_mode = be16_to_cpu(di->di_mode); + li->li_height = di->di_height; + li->li_flags = be32_to_cpu(di->di_flags); + inode->i_uid = be32_to_cpu(di->di_uid); + inode->i_gid = be32_to_cpu(di->di_gid); + inode->i_size = be64_to_cpu(di->di_size); + logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); + inode->i_atime = be64_to_timespec(di->di_atime); + inode->i_ctime = be64_to_timespec(di->di_ctime); + inode->i_mtime = be64_to_timespec(di->di_mtime); + inode->i_nlink = be32_to_cpu(di->di_refcount); + inode->i_generation = be32_to_cpu(di->di_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + inode->i_rdev = be64_to_cpu(di->di_data[0]); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + li->li_data[i] = be64_to_cpu(di->di_data[i]); + break; + default: + BUG(); + } +} + +static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) +{ + struct logfs_inode *li = logfs_inode(inode); + int i; + + di->di_mode = cpu_to_be16(inode->i_mode); + di->di_height = li->li_height; + di->di_pad = 0; + di->di_flags = cpu_to_be32(li->li_flags); + di->di_uid = cpu_to_be32(inode->i_uid); + di->di_gid = cpu_to_be32(inode->i_gid); + di->di_size = cpu_to_be64(i_size_read(inode)); + di->di_used_bytes = cpu_to_be64(li->li_used_bytes); + di->di_atime = timespec_to_be64(inode->i_atime); + di->di_ctime = timespec_to_be64(inode->i_ctime); + di->di_mtime = timespec_to_be64(inode->i_mtime); + di->di_refcount = cpu_to_be32(inode->i_nlink); + di->di_generation = cpu_to_be32(inode->i_generation); + + switch (inode->i_mode & S_IFMT) { + case S_IFSOCK: /* fall through */ + case S_IFBLK: /* fall through */ + case S_IFCHR: /* fall through */ + case S_IFIFO: + di->di_data[0] = cpu_to_be64(inode->i_rdev); + break; + case S_IFDIR: /* fall through */ + case S_IFREG: /* fall through */ + case S_IFLNK: + for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) + di->di_data[i] = cpu_to_be64(li->li_data[i]); + break; + default: + BUG(); + } +} + 
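+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * patch; the helper name is made up): a round trip through the packing
+ * scheme above, assuming 4KiB blocks (LOGFS_BLOCK_BITS == 9) and a
+ * 64-bit build, where INDIRECT_BIT is bit 63 and the level is stored
+ * in bits 60-62.  Packing drops the low level * LOGFS_BLOCK_BITS bits
+ * of the block index; unpacking restores a lower bound via
+ * adjust_bix().
+ */
+static void __maybe_unused logfs_pack_index_example(void)
+{
+	u64 bix;
+	level_t level;
+	pgoff_t index;
+
+	/* A level-1 indirect block covering block index 0x12345. */
+	index = logfs_pack_index(0x12345, LEVEL(1));
+	logfs_unpack_index(index, &bix, &level);
+
+	BUG_ON(level != LEVEL(1));
+	/* The low nine bits were dropped: 0x12345 & ~0x1ff == 0x12200. */
+	BUG_ON(bix != 0x12200);
+}
+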
+static void __logfs_set_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + + inode->i_blocks = ULONG_MAX; + if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) + inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; +} + +void logfs_set_blocks(struct inode *inode, u64 bytes) +{ + struct logfs_inode *li = logfs_inode(inode); + + li->li_used_bytes = bytes; + __logfs_set_blocks(inode); +} + +static void prelock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) { + BUG_ON(PagePreLocked(page)); + SetPagePreLocked(page); + } else { + /* We are in GC path. */ + if (PagePreLocked(page)) + super->s_lock_count++; + else + SetPagePreLocked(page); + } +} + +static void preunlock_page(struct super_block *sb, struct page *page, int lock) +{ + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!PageLocked(page)); + if (lock) + ClearPagePreLocked(page); + else { + /* We are in GC path. */ + BUG_ON(!PagePreLocked(page)); + if (super->s_lock_count) + super->s_lock_count--; + else + ClearPagePreLocked(page); + } +} + +/* + * Logfs is prone to an AB-BA deadlock where one task tries to acquire + * s_write_mutex with a locked page and GC tries to get that page while holding + * s_write_mutex. + * To solve this issue logfs will ignore the page lock iff the page in question + * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked + * in addition to PG_locked. + */ +static void logfs_get_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + prelock_page(sb, page, lock); + + if (lock) { + mutex_lock(&super->s_write_mutex); + logfs_gc_pass(sb); + /* FIXME: We also have to check for shadowed space + * and mempool fill grade */ + } +} + +static void logfs_put_wblocks(struct super_block *sb, struct page *page, + int lock) +{ + struct logfs_super *super = logfs_super(sb); + + if (page) + preunlock_page(sb, page, lock); + /* Order matters - we must clear PG_pre_locked before releasing + * s_write_mutex or we could race against another task. */ + if (lock) + mutex_unlock(&super->s_write_mutex); +} + +static struct page *logfs_get_read_page(struct inode *inode, u64 bix, + level_t level) +{ + return find_or_create_page(inode->i_mapping, + logfs_pack_index(bix, level), GFP_NOFS); +} + +static void logfs_put_read_page(struct page *page) +{ + unlock_page(page); + page_cache_release(page); +} + +static void logfs_lock_write_page(struct page *page) +{ + int loop = 0; + + while (unlikely(!trylock_page(page))) { + if (loop++ > 0x1000) { + /* Has been observed once so far... */ + printk(KERN_ERR "stack at %p\n", &loop); + BUG(); + } + if (PagePreLocked(page)) { + /* Holder of page lock is waiting for us, it + * is safe to use this page. */ + break; + } + /* Some other process has this page locked and has + * nothing to do with us. Wait for it to finish. 
+ */ + schedule(); + } + BUG_ON(!PageLocked(page)); +} + +static struct page *logfs_get_write_page(struct inode *inode, u64 bix, + level_t level) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index = logfs_pack_index(bix, level); + struct page *page; + int err; + +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + return NULL; + } + } else logfs_lock_write_page(page); + BUG_ON(!PageLocked(page)); + return page; +} + +static void logfs_unlock_write_page(struct page *page) +{ + if (!PagePreLocked(page)) + unlock_page(page); +} + +static void logfs_put_write_page(struct page *page) +{ + logfs_unlock_write_page(page); + page_cache_release(page); +} + +static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, + int rw) +{ + if (rw == READ) + return logfs_get_read_page(inode, bix, level); + else + return logfs_get_write_page(inode, bix, level); +} + +static void logfs_put_page(struct page *page, int rw) +{ + if (rw == READ) + logfs_put_read_page(page); + else + logfs_put_write_page(page); +} + +static unsigned long __get_bits(u64 val, int skip, int no) +{ + u64 ret = val; + + ret >>= skip * no; + ret <<= 64 - no; + ret >>= 64 - no; + return ret; +} + +static unsigned long get_bits(u64 val, level_t skip) +{ + return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); +} + +static inline void init_shadow_tree(struct super_block *sb, + struct shadow_tree *tree) +{ + struct logfs_super *super = logfs_super(sb); + + btree_init_mempool64(&tree->new, super->s_btree_pool); + btree_init_mempool64(&tree->old, super->s_btree_pool); +} + +static void indirect_write_block(struct logfs_block *block) +{ + struct page *page; + struct inode *inode; + int ret; + + page = block->page; + inode = page->mapping->host; + logfs_lock_write_page(page); + ret = logfs_write_buf(inode, page, 0); + logfs_unlock_write_page(page); + /* + * This needs some rework. Unless you want your filesystem to run + * completely synchronously (you don't), the filesystem will always + * report writes as 'successful' before the actual work has been + * done. The actual work gets done here and this is where any errors + * will show up. And there isn't much we can do about it, really. + * + * Some attempts to fix the errors (move from bad blocks, retry io,...) + * have already been done, so anything left should be either a broken + * device or a bug somewhere in logfs itself. Being relatively new, + * the odds currently favor a bug, so for now the line below isn't + * entirely tasteles. 
+	 */
+	BUG_ON(ret);
+}
+
+static void inode_write_block(struct logfs_block *block)
+{
+	struct inode *inode;
+	int ret;
+
+	inode = block->inode;
+	if (inode->i_ino == LOGFS_INO_MASTER)
+		logfs_write_anchor(inode->i_sb);
+	else {
+		ret = __logfs_write_inode(inode, 0);
+		/* see indirect_write_block comment */
+		BUG_ON(ret);
+	}
+}
+
+static gc_level_t inode_block_level(struct logfs_block *block)
+{
+	BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
+	return GC_LEVEL(LOGFS_MAX_LEVELS);
+}
+
+static gc_level_t indirect_block_level(struct logfs_block *block)
+{
+	struct page *page;
+	struct inode *inode;
+	u64 bix;
+	level_t level;
+
+	page = block->page;
+	inode = page->mapping->host;
+	logfs_unpack_index(page->index, &bix, &level);
+	return expand_level(inode->i_ino, level);
+}
+
+/*
+ * This silences a false, yet annoying gcc warning. I hate it when my editor
+ * jumps into bitops.h each time I recompile this file.
+ * TODO: Complain to gcc folks about this and upgrade compiler.
+ */
+static unsigned long fnb(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
+{
+	return find_next_bit(addr, size, offset);
+}
+
+static __be64 inode_val0(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 val;
+
+	/*
+	 * Explicit shifting generates good code, but must match the format
+	 * of the structure. Add some paranoia just in case.
+	 */
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
+	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
+
+	val =	(u64)inode->i_mode << 48 |
+		(u64)li->li_height << 40 |
+		(u64)li->li_flags;
+	return cpu_to_be64(val);
+}
+
+static int inode_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	struct inode *inode = block->inode;
+	struct logfs_inode *li = logfs_inode(inode);
+	unsigned long pos;
+	u64 ino, bix;
+	__be64 val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
+			return 0;
+
+		switch (pos) {
+		case INODE_HEIGHT_OFS:
+			val = inode_val0(inode);
+			break;
+		case INODE_USED_OFS:
+			val = cpu_to_be64(li->li_used_bytes);
+			break;
+		case INODE_SIZE_OFS:
+			val = cpu_to_be64(i_size_read(inode));
+			break;
+		case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
+			val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
+			break;
+		default:
+			BUG();
+		}
+
+		ino = LOGFS_INO_MASTER;
+		bix = inode->i_ino;
+		level = LEVEL(0);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+static int indirect_write_alias(struct super_block *sb,
+		struct logfs_block *block, write_alias_t *write_one_alias)
+{
+	unsigned long pos;
+	struct page *page = block->page;
+	u64 ino, bix;
+	__be64 *child, val;
+	level_t level;
+	int err;
+
+	for (pos = 0; ; pos++) {
+		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
+		if (pos >= LOGFS_BLOCK_FACTOR)
+			return 0;
+
+		ino = page->mapping->host->i_ino;
+		logfs_unpack_index(page->index, &bix, &level);
+		child = kmap_atomic(page, KM_USER0);
+		val = child[pos];
+		kunmap_atomic(child, KM_USER0);
+		err = write_one_alias(sb, ino, bix, level, pos, val);
+		if (err)
+			return err;
+	}
+}
+
+int logfs_write_obj_aliases_pagecache(struct super_block *sb)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+	int err;
+
+	list_for_each_entry(block, &super->s_object_alias, alias_list) {
+		err = block->ops->write_alias(sb, block, write_alias_journal);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+void __free_block(struct super_block *sb, struct logfs_block *block)
+{
+	BUG_ON(!list_empty(&block->item_list));
+	list_del(&block->alias_list);
+	mempool_free(block, logfs_super(sb)->s_block_pool);
+}
+
+static void inode_free_block(struct super_block *sb, struct logfs_block *block)
+{
+	struct inode *inode = block->inode;
+
+	logfs_inode(inode)->li_block = NULL;
+	__free_block(sb, block);
+}
+
+static void indirect_free_block(struct super_block *sb,
+		struct logfs_block *block)
+{
+	ClearPagePrivate(block->page);
+	block->page->private = 0;
+	__free_block(sb, block);
+}
+
+static struct logfs_block_ops inode_block_ops = {
+	.write_block = inode_write_block,
+	.block_level = inode_block_level,
+	.free_block = inode_free_block,
+	.write_alias = inode_write_alias,
+};
+
+struct logfs_block_ops indirect_block_ops = {
+	.write_block = indirect_write_block,
+	.block_level = indirect_block_level,
+	.free_block = indirect_free_block,
+	.write_alias = indirect_write_alias,
+};
+
+struct logfs_block *__alloc_block(struct super_block *sb,
+		u64 ino, u64 bix, level_t level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct logfs_block *block;
+
+	block = mempool_alloc(super->s_block_pool, GFP_NOFS);
+	memset(block, 0, sizeof(*block));
+	INIT_LIST_HEAD(&block->alias_list);
+	INIT_LIST_HEAD(&block->item_list);
+	block->sb = sb;
+	block->ino = ino;
+	block->bix = bix;
+	block->level = level;
+	return block;
+}
+
+static void alloc_inode_block(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	struct logfs_block *block;
+
+	if (li->li_block)
+		return;
+
+	block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
+	block->inode = inode;
+	li->li_block = block;
+	block->ops = &inode_block_ops;
+}
+
+void initialize_block_counters(struct page *page, struct logfs_block *block,
+		__be64 *array, int page_is_empty)
+{
+	u64 ptr;
+	int i, start;
+
+	block->partial = 0;
+	block->full = 0;
+	start = 0;
+	if (page->index < first_indirect_block()) {
+		/* Counters are pointless on level 0 */
+		return;
+	}
+	if (page->index == first_indirect_block()) {
+		/* Skip unused pointers */
+		start = I0_BLOCKS;
+		block->full = I0_BLOCKS;
+	}
+	if (!page_is_empty) {
+		for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
+			ptr =
be64_to_cpu(array[i]); + if (ptr) + block->partial++; + if (ptr & LOGFS_FULLY_POPULATED) + block->full++; + } + } +} + +static void alloc_data_block(struct inode *inode, struct page *page) +{ + struct logfs_block *block; + u64 bix; + level_t level; + + if (PagePrivate(page)) + return; + + logfs_unpack_index(page->index, &bix, &level); + block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; +} + +static void alloc_indirect_block(struct inode *inode, struct page *page, + int page_is_empty) +{ + struct logfs_block *block; + __be64 *array; + + if (PagePrivate(page)) + return; + + alloc_data_block(inode, page); + + block = logfs_block(page); + array = kmap_atomic(page, KM_USER0); + initialize_block_counters(page, block, array, page_is_empty); + kunmap_atomic(array, KM_USER0); +} + +static void block_set_pointer(struct page *page, int index, u64 ptr) +{ + struct logfs_block *block = logfs_block(page); + __be64 *array; + u64 oldptr; + + BUG_ON(!block); + array = kmap_atomic(page, KM_USER0); + oldptr = be64_to_cpu(array[index]); + array[index] = cpu_to_be64(ptr); + kunmap_atomic(array, KM_USER0); + SetPageUptodate(page); + + block->full += !!(ptr & LOGFS_FULLY_POPULATED) + - !!(oldptr & LOGFS_FULLY_POPULATED); + block->partial += !!ptr - !!oldptr; +} + +static u64 block_get_pointer(struct page *page, int index) +{ + __be64 *block; + u64 ptr; + + block = kmap_atomic(page, KM_USER0); + ptr = be64_to_cpu(block[index]); + kunmap_atomic(block, KM_USER0); + return ptr; +} + +static int logfs_read_empty(struct page *page) +{ + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + return 0; +} + +static int logfs_read_direct(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + pgoff_t index = page->index; + u64 block; + + block = li->li_data[index]; + if (!block) + return logfs_read_empty(page); + + return logfs_segment_read(inode, page, block, index, 0); +} + +static int logfs_read_loop(struct inode *inode, struct page *page, + int rw_context) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bix, bofs = li->li_data[INDIRECT_INDEX]; + level_t level, target_level; + int ret; + struct page *ipage; + + logfs_unpack_index(page->index, &bix, &target_level); + if (!bofs) + return logfs_read_empty(page); + + if (bix >= maxbix(li->li_height)) + return logfs_read_empty(page); + + for (level = LEVEL(li->li_height); + (__force u8)level > (__force u8)target_level; + level = SUBLEVEL(level)){ + ipage = logfs_get_page(inode, bix, level, rw_context); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_page(ipage, rw_context); + if (!bofs) + return logfs_read_empty(page); + } + + return logfs_segment_read(inode, page, bofs, bix, 0); +} + +static int logfs_read_block(struct inode *inode, struct page *page, + int rw_context) +{ + pgoff_t index = page->index; + + if (index < I0_BLOCKS) + return logfs_read_direct(inode, page); + return logfs_read_loop(inode, page, rw_context); +} + +static int logfs_exist_loop(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + u64 bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret; + struct page *ipage; + + if (!bofs) + return 0; + if (bix >= maxbix(li->li_height)) + return 0; + + for (level = 
LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + ipage = logfs_get_read_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + ret = logfs_segment_read(inode, ipage, bofs, bix, level); + if (ret) { + logfs_put_read_page(ipage); + return ret; + } + + bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); + logfs_put_read_page(ipage); + if (!bofs) + return 0; + } + + return 1; +} + +int logfs_exist_block(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) + return !!li->li_data[bix]; + return logfs_exist_loop(inode, bix); +} + +static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + + for (; bix < I0_BLOCKS; bix++) + if (data ^ (li->li_data[bix] == 0)) + return bix; + return I0_BLOCKS; +} + +static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) +{ + struct logfs_inode *li = logfs_inode(inode); + __be64 *rblock; + u64 increment, bofs = li->li_data[INDIRECT_INDEX]; + level_t level; + int ret, slot; + struct page *page; + + BUG_ON(!bofs); + + for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { + increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); + page = logfs_get_read_page(inode, bix, level); + if (!page) + return bix; + + ret = logfs_segment_read(inode, page, bofs, bix, level); + if (ret) { + logfs_put_read_page(page); + return bix; + } + + slot = get_bits(bix, SUBLEVEL(level)); + rblock = kmap_atomic(page, KM_USER0); + while (slot < LOGFS_BLOCK_FACTOR) { + if (data && (rblock[slot] != 0)) + break; + if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) + break; + slot++; + bix += increment; + bix &= ~(increment - 1); + } + if (slot >= LOGFS_BLOCK_FACTOR) { + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + return bix; + } + bofs = be64_to_cpu(rblock[slot]); + kunmap_atomic(rblock, KM_USER0); + logfs_put_read_page(page); + if (!bofs) { + BUG_ON(data); + return bix; + } + } + return bix; +} + +/** + * logfs_seek_hole - find next hole starting at a given block index + * @inode: inode to search in + * @bix: block index to start searching + * + * Returns next hole. If the file doesn't contain any further holes, the + * block address next to eof is returned instead. + */ +u64 logfs_seek_hole(struct inode *inode, u64 bix) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (bix < I0_BLOCKS) { + bix = seek_holedata_direct(inode, bix, 0); + if (bix < I0_BLOCKS) + return bix; + } + + if (!li->li_data[INDIRECT_INDEX]) + return bix; + else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) + bix = maxbix(li->li_height); + else { + bix = seek_holedata_loop(inode, bix, 0); + if (bix < maxbix(li->li_height)) + return bix; + /* Should not happen anymore. But if some port writes semi- + * corrupt images (as this one used to) we might run into it. 
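+		 * Hence the check below is only a WARN_ON_ONCE rather than
+		 * a BUG_ON.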
+		 */
+		WARN_ON_ONCE(bix == maxbix(li->li_height));
+	}
+
+	return bix;
+}
+
+static u64 __logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (bix < I0_BLOCKS) {
+		bix = seek_holedata_direct(inode, bix, 1);
+		if (bix < I0_BLOCKS)
+			return bix;
+	}
+
+	if (bix < maxbix(li->li_height)) {
+		if (!li->li_data[INDIRECT_INDEX])
+			bix = maxbix(li->li_height);
+		else
+			return seek_holedata_loop(inode, bix, 1);
+	}
+
+	return bix;
+}
+
+/**
+ * logfs_seek_data - find next data block after a given block index
+ * @inode: inode to search in
+ * @bix: block index to start searching
+ *
+ * Returns next data block. If the file doesn't contain any further data
+ * blocks, the last block in the file is returned instead.
+ */
+u64 logfs_seek_data(struct inode *inode, u64 bix)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 ret, end;
+
+	ret = __logfs_seek_data(inode, bix);
+	end = i_size_read(inode) >> sb->s_blocksize_bits;
+	if (ret >= end)
+		ret = max(bix, end);
+	return ret;
+}
+
+static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
+{
+	return pure_ofs(li->li_data[bix]) == ofs;
+}
+
+static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
+		u64 ofs, u64 bofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	level_t level;
+	int ret;
+	struct page *page;
+
+	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
+		page = logfs_get_write_page(inode, bix, level);
+		BUG_ON(!page);
+
+		ret = logfs_segment_read(inode, page, bofs, bix, level);
+		if (ret) {
+			logfs_put_write_page(page);
+			return 0;
+		}
+
+		bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
+		logfs_put_write_page(page);
+		if (!bofs)
+			return 0;
+
+		if (pure_ofs(bofs) == ofs)
+			return 1;
+	}
+	return 0;
+}
+
+static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+	u64 bofs = li->li_data[INDIRECT_INDEX];
+
+	if (!bofs)
+		return 0;
+
+	if (bix >= maxbix(li->li_height))
+		return 0;
+
+	if (pure_ofs(bofs) == ofs)
+		return 1;
+
+	return __logfs_is_valid_loop(inode, bix, ofs, bofs);
+}
+
+static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
+		return 0;
+
+	if (bix < I0_BLOCKS)
+		return logfs_is_valid_direct(li, bix, ofs);
+	return logfs_is_valid_loop(inode, bix, ofs);
+}
+
+/**
+ * logfs_is_valid_block - check whether this block is still valid
+ * @sb: superblock
+ * @ofs: block physical offset
+ * @ino: block inode number
+ * @bix: block index
+ * @gc_level: block level
+ *
+ * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
+ * become invalid once the journal is written.
+ */
+int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
+		gc_level_t gc_level)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct inode *inode;
+	int ret, cookie;
+
+	/* Umount closes a segment with free blocks remaining. Those
+	 * blocks are by definition invalid. */
+	if (ino == -1)
+		return 0;
+
+	LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
+
+	inode = logfs_safe_iget(sb, ino, &cookie);
+	if (IS_ERR(inode))
+		goto invalid;
+
+	ret = __logfs_is_valid_block(inode, bix, ofs);
+	logfs_safe_iput(inode, cookie);
+	if (ret)
+		return ret;
+
+invalid:
+	/* Block is nominally invalid, but may still sit in the shadow tree,
+	 * waiting for a journal commit.
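+	 * In that case the old object may still be needed until the commit
+	 * happens, so report 2 and let the caller keep the block around
+	 * until the journal is written.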
+ */ + if (btree_lookup64(&super->s_shadow_tree.old, ofs)) + return 2; + return 0; +} + +int logfs_readpage_nolock(struct page *page) +{ + struct inode *inode = page->mapping->host; + int ret = -EIO; + + ret = logfs_read_block(inode, page, READ); + + if (ret) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + flush_dcache_page(page); + + return ret; +} + +static int logfs_reserve_bytes(struct inode *inode, int bytes) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + u64 available = super->s_free_bytes + super->s_dirty_free_bytes + - super->s_dirty_used_bytes - super->s_dirty_pages; + + if (!bytes) + return 0; + + if (available < bytes) + return -ENOSPC; + + if (available < bytes + super->s_root_reserve && + !capable(CAP_SYS_RESOURCE)) + return -ENOSPC; + + return 0; +} + +int get_page_reserve(struct inode *inode, struct page *page) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + int ret; + + if (logfs_block(page) && logfs_block(page)->reserved_bytes) + return 0; + + logfs_get_wblocks(inode->i_sb, page, WF_LOCK); + ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE); + if (!ret) { + alloc_data_block(inode, page); + logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; + super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; + } + logfs_put_wblocks(inode->i_sb, page, WF_LOCK); + return ret; +} + +/* + * We are protected by write lock. Push victims up to superblock level + * and release transaction when appropriate. + */ +/* FIXME: This is currently called from the wrong spots. */ +static void logfs_handle_transaction(struct inode *inode, + struct logfs_transaction *ta) +{ + struct logfs_super *super = logfs_super(inode->i_sb); + + if (!ta) + return; + logfs_inode(inode)->li_block->ta = NULL; + + if (inode->i_ino != LOGFS_INO_MASTER) { + BUG(); /* FIXME: Yes, this needs more thought */ + /* just remember the transaction until inode is written */ + //BUG_ON(logfs_inode(inode)->li_transaction); + //logfs_inode(inode)->li_transaction = ta; + return; + } + + switch (ta->state) { + case CREATE_1: /* fall through */ + case UNLINK_1: + BUG_ON(super->s_victim_ino); + super->s_victim_ino = ta->ino; + break; + case CREATE_2: /* fall through */ + case UNLINK_2: + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + /* transaction ends here - free it */ + kfree(ta); + break; + case CROSS_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + break; + case CROSS_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + kfree(ta); + break; + case TARGET_RENAME_1: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino); + super->s_rename_dir = ta->dir; + super->s_rename_pos = ta->pos; + super->s_victim_ino = ta->ino; + break; + case TARGET_RENAME_2: + BUG_ON(super->s_rename_dir != ta->dir); + BUG_ON(super->s_rename_pos != ta->pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_rename_dir = 0; + super->s_rename_pos = 0; + break; + case TARGET_RENAME_3: + BUG_ON(super->s_rename_dir); + BUG_ON(super->s_rename_pos); + BUG_ON(super->s_victim_ino != ta->ino); + super->s_victim_ino = 0; + kfree(ta); + break; + default: + BUG(); + } +} + +/* + * Not strictly a reservation, but rather a check that we still have enough + * space to satisfy the write. 
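+ *
+ * A rough worked example, using the accounting fields checked by
+ * logfs_reserve_bytes() above: a single-block write asks for
+ * 1 * LOGFS_MAX_OBJECTSIZE bytes and succeeds iff
+ *	s_free_bytes + s_dirty_free_bytes - s_dirty_used_bytes - s_dirty_pages
+ * covers it (plus s_root_reserve for tasks without CAP_SYS_RESOURCE).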
+ */
+static int logfs_reserve_blocks(struct inode *inode, int blocks)
+{
+	return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
+}
+
+struct write_control {
+	u64 ofs;
+	long flags;
+};
+
+static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
+		level_t level, u64 old_ofs)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_shadow *shadow;
+
+	shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
+	memset(shadow, 0, sizeof(*shadow));
+	shadow->ino = inode->i_ino;
+	shadow->bix = bix;
+	shadow->gc_level = expand_level(inode->i_ino, level);
+	shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
+	return shadow;
+}
+
+static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+
+	mempool_free(shadow, super->s_shadow_pool);
+}
+
+/**
+ * fill_shadow_tree - Propagate shadow tree changes due to a write
+ * @inode: Inode owning the page
+ * @page: Struct page that was written
+ * @shadow: Shadow for the current write
+ *
+ * Writes in logfs can result in two semi-valid objects. The old object
+ * is still valid as long as it can be reached by following pointers on
+ * the medium. Only when writes propagate all the way up to the journal
+ * has the new object safely replaced the old one.
+ *
+ * To handle this problem, a struct logfs_shadow is used to represent
+ * every single write. It is attached to the indirect block, which is
+ * marked dirty. When the indirect block is written, its shadows are
+ * handed up to the next indirect block (or inode). Ultimately they
+ * will reach the master inode and be freed upon journal commit.
+ *
+ * This function handles a single step in the propagation. It adds the
+ * shadow for the current write to the tree, along with any shadows in
+ * the page's tree, in case it was an indirect block. If a page is
+ * written, the inode parameter is left NULL; if an inode is written,
+ * the page parameter is left NULL.
+ */
+static void fill_shadow_tree(struct inode *inode, struct page *page,
+		struct logfs_shadow *shadow)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
+	struct shadow_tree *tree = &super->s_shadow_tree;
+
+	if (PagePrivate(page)) {
+		if (block->alias_map)
+			super->s_no_object_aliases -= bitmap_weight(
+					block->alias_map, LOGFS_BLOCK_FACTOR);
+		logfs_handle_transaction(inode, block->ta);
+		block->ops->free_block(inode->i_sb, block);
+	}
+	if (shadow) {
+		if (shadow->old_ofs)
+			btree_insert64(&tree->old, shadow->old_ofs, shadow,
+					GFP_NOFS);
+		else
+			btree_insert64(&tree->new, shadow->new_ofs, shadow,
+					GFP_NOFS);
+
+		super->s_dirty_used_bytes += shadow->new_len;
+		super->s_dirty_free_bytes += shadow->old_len;
+	}
+}
+
+static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
+		long child_no)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
+		/* Aliases in the master inode are pointless. */
+		return;
+	}
+
+	if (!test_bit(child_no, block->alias_map)) {
+		set_bit(child_no, block->alias_map);
+		super->s_no_object_aliases++;
+	}
+	list_move_tail(&block->alias_list, &super->s_object_alias);
+}
+
+/*
+ * Object aliases can and often do change the size and occupied space of a
+ * file. So not only do we have to change the pointers, we also have to
+ * change inode->i_size and li->li_used_bytes. Which is done by setting
+ * another two object aliases for the inode itself.
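+ * (Those are the INODE_USED_OFS and INODE_SIZE_OFS aliases that
+ * set_iused() below attaches to the inode's own block.)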
+ */ +static void set_iused(struct inode *inode, struct logfs_shadow *shadow) +{ + struct logfs_inode *li = logfs_inode(inode); + + if (shadow->new_len == shadow->old_len) + return; + + alloc_inode_block(inode); + li->li_used_bytes += shadow->new_len - shadow->old_len; + __logfs_set_blocks(inode); + logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); + logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); +} + +static int logfs_write_i0(struct inode *inode, struct page *page, + struct write_control *wc) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int full, err = 0; + + logfs_unpack_index(page->index, &bix, &level); + if (wc->ofs == 0) + if (logfs_reserve_blocks(inode, 1)) + return -ENOSPC; + + shadow = alloc_shadow(inode, bix, level, wc->ofs); + if (wc->flags & WF_WRITE) + err = logfs_segment_write(inode, page, shadow); + if (wc->flags & WF_DELETE) + logfs_segment_delete(inode, shadow); + if (err) { + free_shadow(inode, shadow); + return err; + } + + set_iused(inode, shadow); + full = 1; + if (level != 0) { + alloc_indirect_block(inode, page, 0); + full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; + } + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + if (wc->ofs && full) + wc->ofs |= LOGFS_FULLY_POPULATED; + return 0; +} + +static int logfs_write_direct(struct inode *inode, struct page *page, + long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[page->index], + .flags = flags, + }; + int err; + + alloc_inode_block(inode); + + err = logfs_write_i0(inode, page, &wc); + if (err) + return err; + + li->li_data[page->index] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + page->index + INODE_POINTER_OFS); + return 0; +} + +static int ptr_change(u64 ofs, struct page *page) +{ + struct logfs_block *block = logfs_block(page); + int empty0, empty1, full0, full1; + + empty0 = ofs == 0; + empty1 = block->partial == 0; + if (empty0 != empty1) + return 1; + + /* The !! 
is necessary to shrink result to int */ + full0 = !!(ofs & LOGFS_FULLY_POPULATED); + full1 = block->full == LOGFS_BLOCK_FACTOR; + if (full0 != full1) + return 1; + return 0; +} + +static int __logfs_write_rec(struct inode *inode, struct page *page, + struct write_control *this_wc, + pgoff_t bix, level_t target_level, level_t level) +{ + int ret, page_empty = 0; + int child_no = get_bits(bix, SUBLEVEL(level)); + struct page *ipage; + struct write_control child_wc = { + .flags = this_wc->flags, + }; + + ipage = logfs_get_write_page(inode, bix, level); + if (!ipage) + return -ENOMEM; + + if (this_wc->ofs) { + ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (ret) + goto out; + } else if (!PageUptodate(ipage)) { + page_empty = 1; + logfs_read_empty(ipage); + } + + child_wc.ofs = block_get_pointer(ipage, child_no); + + if ((__force u8)level-1 > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &child_wc, bix, + target_level, SUBLEVEL(level)); + else + ret = logfs_write_i0(inode, page, &child_wc); + + if (ret) + goto out; + + alloc_indirect_block(inode, ipage, page_empty); + block_set_pointer(ipage, child_no, child_wc.ofs); + /* FIXME: first condition seems superfluous */ + if (child_wc.ofs || logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + /* the condition on this_wc->ofs ensures that we won't consume extra + * space for indirect blocks in the future, which we cannot reserve */ + if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) + ret = logfs_write_i0(inode, ipage, this_wc); + else + logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); +out: + logfs_put_write_page(ipage); + return ret; +} + +static int logfs_write_rec(struct inode *inode, struct page *page, + pgoff_t bix, level_t target_level, long flags) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + .flags = flags, + }; + int ret; + + alloc_inode_block(inode); + + if (li->li_height > (__force u8)target_level) + ret = __logfs_write_rec(inode, page, &wc, bix, target_level, + LEVEL(li->li_height)); + else + ret = logfs_write_i0(inode, page, &wc); + if (ret) + return ret; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) { + li->li_data[INDIRECT_INDEX] = wc.ofs; + logfs_set_alias(inode->i_sb, li->li_block, + INDIRECT_INDEX + INODE_POINTER_OFS); + } + return ret; +} + +void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + alloc_inode_block(inode); + logfs_inode(inode)->li_block->ta = ta; +} + +void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) +{ + struct logfs_block *block = logfs_inode(inode)->li_block; + + if (block && block->ta) + block->ta = NULL; +} + +static int grow_inode(struct inode *inode, u64 bix, level_t level) +{ + struct logfs_inode *li = logfs_inode(inode); + u8 height = (__force u8)level; + struct page *page; + struct write_control wc = { + .flags = WF_WRITE, + }; + int err; + + BUG_ON(height > 5 || li->li_height > 5); + while (height > li->li_height || bix >= maxbix(li->li_height)) { + page = logfs_get_write_page(inode, I0_BLOCKS + 1, + LEVEL(li->li_height + 1)); + if (!page) + return -ENOMEM; + logfs_read_empty(page); + alloc_indirect_block(inode, page, 1); + block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); + err = logfs_write_i0(inode, page, &wc); + logfs_put_write_page(page); + if (err) + return err; + li->li_data[INDIRECT_INDEX] = wc.ofs; + wc.ofs = 0; + li->li_height++; + logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); 
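+
+		/* One pass of this loop has added one level of indirection:
+		 * the old top-level block now hangs off slot 0 of the fresh
+		 * indirect block, and the new height is recorded as an alias
+		 * for the next journal commit. */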
+	}
+	return 0;
+}
+
+static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct logfs_super *super = logfs_super(inode->i_sb);
+	pgoff_t index = page->index;
+	u64 bix;
+	level_t level;
+	int err;
+
+	flags |= WF_WRITE | WF_DELETE;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	logfs_unpack_index(index, &bix, &level);
+	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+		super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
+
+	if (index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+
+	bix = adjust_bix(bix, level);
+	err = grow_inode(inode, bix, level);
+	if (err)
+		return err;
+	return logfs_write_rec(inode, page, bix, level, flags);
+}
+
+int logfs_write_buf(struct inode *inode, struct page *page, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	logfs_get_wblocks(sb, page, flags & WF_LOCK);
+	ret = __logfs_write_buf(inode, page, flags);
+	logfs_put_wblocks(sb, page, flags & WF_LOCK);
+	return ret;
+}
+
+static int __logfs_delete(struct inode *inode, struct page *page)
+{
+	long flags = WF_DELETE;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	if (page->index < I0_BLOCKS)
+		return logfs_write_direct(inode, page, flags);
+	return logfs_write_rec(inode, page, page->index, 0, flags);
+}
+
+int logfs_delete(struct inode *inode, pgoff_t index,
+		struct shadow_tree *shadow_tree)
+{
+	struct super_block *sb = inode->i_sb;
+	struct page *page;
+	int ret;
+
+	page = logfs_get_read_page(inode, index, 0);
+	if (!page)
+		return -ENOMEM;
+
+	logfs_get_wblocks(sb, page, 1);
+	ret = __logfs_delete(inode, page);
+	logfs_put_wblocks(sb, page, 1);
+
+	logfs_put_read_page(page);
+
+	return ret;
+}
+
+/* Rewrite cannot mark the inode dirty but has to write it immediately. */
+int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
+		gc_level_t gc_level, long flags)
+{
+	level_t level = shrink_level(gc_level);
+	struct page *page;
+	int err;
+
+	page = logfs_get_write_page(inode, bix, level);
+	if (!page)
+		return -ENOMEM;
+
+	err = logfs_segment_read(inode, page, ofs, bix, level);
+	if (!err) {
+		if (level != 0)
+			alloc_indirect_block(inode, page, 0);
+		err = logfs_write_buf(inode, page, flags);
+	}
+	logfs_put_write_page(page);
+	return err;
+}
+
+static int truncate_data_block(struct inode *inode, struct page *page,
+		u64 ofs, struct logfs_shadow *shadow, u64 size)
+{
+	loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
+	u64 bix;
+	level_t level;
+	int err;
+
+	/* Does truncation happen within this page?
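+	 * That is, does the new size fall strictly inside this block, so
+	 * that only the tail of the block needs zeroing rather than the
+	 * whole block being dropped?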
*/ + if (size <= pageofs || size - pageofs >= PAGE_SIZE) + return 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + + err = logfs_segment_read(inode, page, ofs, bix, level); + if (err) + return err; + + zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE); + return logfs_segment_write(inode, page, shadow); +} + +static int logfs_truncate_i0(struct inode *inode, struct page *page, + struct write_control *wc, u64 size) +{ + struct logfs_shadow *shadow; + u64 bix; + level_t level; + int err = 0; + + logfs_unpack_index(page->index, &bix, &level); + BUG_ON(level != 0); + shadow = alloc_shadow(inode, bix, level, wc->ofs); + + err = truncate_data_block(inode, page, wc->ofs, shadow, size); + if (err) { + free_shadow(inode, shadow); + return err; + } + + logfs_segment_delete(inode, shadow); + set_iused(inode, shadow); + fill_shadow_tree(inode, page, shadow); + wc->ofs = shadow->new_ofs; + return 0; +} + +static int logfs_truncate_direct(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc; + struct page *page; + int e; + int err; + + alloc_inode_block(inode); + + for (e = I0_BLOCKS - 1; e >= 0; e--) { + if (size > (e+1) * LOGFS_BLOCKSIZE) + break; + + wc.ofs = li->li_data[e]; + if (!wc.ofs) + continue; + + page = logfs_get_write_page(inode, e, 0); + if (!page) + return -ENOMEM; + err = logfs_segment_read(inode, page, wc.ofs, e, 0); + if (err) { + logfs_put_write_page(page); + return err; + } + err = logfs_truncate_i0(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + li->li_data[e] = wc.ofs; + } + return 0; +} + +/* FIXME: these need to become per-sb once we support different blocksizes */ +static u64 __logfs_step[] = { + 1, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS, +}; + +static u64 __logfs_start_index[] = { + I0_BLOCKS, + I1_BLOCKS, + I2_BLOCKS, + I3_BLOCKS +}; + +static inline u64 logfs_step(level_t level) +{ + return __logfs_step[(__force u8)level]; +} + +static inline u64 logfs_factor(u8 level) +{ + return __logfs_step[level] * LOGFS_BLOCKSIZE; +} + +static inline u64 logfs_start_index(level_t level) +{ + return __logfs_start_index[(__force u8)level]; +} + +static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) +{ + logfs_unpack_index(index, bix, level); + if (*bix <= logfs_start_index(SUBLEVEL(*level))) + *bix = 0; +} + +static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, + struct write_control *this_wc, u64 size) +{ + int truncate_happened = 0; + int e, err = 0; + u64 bix, child_bix, next_bix; + level_t level; + struct page *page; + struct write_control child_wc = { /* FIXME: flags */ }; + + logfs_unpack_raw_index(ipage->index, &bix, &level); + err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); + if (err) + return err; + + for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { + child_bix = bix + e * logfs_step(SUBLEVEL(level)); + next_bix = child_bix + logfs_step(SUBLEVEL(level)); + if (size > next_bix * LOGFS_BLOCKSIZE) + break; + + child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); + if (!child_wc.ofs) + continue; + + page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); + if (!page) + return -ENOMEM; + + if ((__force u8)level > 1) + err = __logfs_truncate_rec(inode, page, &child_wc, size); + else + err = logfs_truncate_i0(inode, page, &child_wc, size); + logfs_put_write_page(page); + if (err) + return err; + + truncate_happened = 1; + alloc_indirect_block(inode, ipage, 0); + block_set_pointer(ipage, e, 
child_wc.ofs); + } + + if (!truncate_happened) { + printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); + return 0; + } + + this_wc->flags = WF_DELETE; + if (logfs_block(ipage)->partial) + this_wc->flags |= WF_WRITE; + + return logfs_write_i0(inode, ipage, this_wc); +} + +static int logfs_truncate_rec(struct inode *inode, u64 size) +{ + struct logfs_inode *li = logfs_inode(inode); + struct write_control wc = { + .ofs = li->li_data[INDIRECT_INDEX], + }; + struct page *page; + int err; + + alloc_inode_block(inode); + + if (!wc.ofs) + return 0; + + page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); + if (!page) + return -ENOMEM; + + err = __logfs_truncate_rec(inode, page, &wc, size); + logfs_put_write_page(page); + if (err) + return err; + + if (li->li_data[INDIRECT_INDEX] != wc.ofs) + li->li_data[INDIRECT_INDEX] = wc.ofs; + return 0; +} + +static int __logfs_truncate(struct inode *inode, u64 size) +{ + int ret; + + if (size >= logfs_factor(logfs_inode(inode)->li_height)) + return 0; + + ret = logfs_truncate_rec(inode, size); + if (ret) + return ret; + + return logfs_truncate_direct(inode, size); +} + +int logfs_truncate(struct inode *inode, u64 size) +{ + struct super_block *sb = inode->i_sb; + int err; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, size); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + + if (!err) + err = vmtruncate(inode, size); + + /* I don't trust error recovery yet. */ + WARN_ON(err); + return err; +} + +static void move_page_to_inode(struct inode *inode, struct page *page) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = logfs_block(page); + + if (!block) + return; + + log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(li->li_block); + block->ops = &inode_block_ops; + block->inode = inode; + li->li_block = block; + + block->page = NULL; + page->private = 0; + ClearPagePrivate(page); +} + +static void move_inode_to_page(struct page *page, struct inode *inode) +{ + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + + if (!block) + return; + + log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + BUG_ON(PagePrivate(page)); + block->ops = &indirect_block_ops; + block->page = page; + page->private = (unsigned long)block; + SetPagePrivate(page); + + block->inode = NULL; + li->li_block = NULL; +} + +int logfs_read_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct inode *master_inode = super->s_master_inode; + struct page *page; + struct logfs_disk_inode *di; + u64 ino = inode->i_ino; + + if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) + return -ENODATA; + if (!logfs_exist_block(master_inode, ino)) + return -ENODATA; + + page = read_cache_page(master_inode->i_mapping, ino, + (filler_t *)logfs_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + di = kmap_atomic(page, KM_USER0); + logfs_disk_to_inode(di, inode); + kunmap_atomic(di, KM_USER0); + move_page_to_inode(inode, page); + page_cache_release(page); + return 0; +} + +/* Caller must logfs_put_write_page(page); */ +static struct page *inode_to_page(struct inode *inode) +{ + struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; + struct logfs_disk_inode *di; + struct page *page; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + + page = 
logfs_get_write_page(master_inode, inode->i_ino, 0); + if (!page) + return NULL; + + di = kmap_atomic(page, KM_USER0); + logfs_inode_to_disk(inode, di); + kunmap_atomic(di, KM_USER0); + move_inode_to_page(page, inode); + return page; +} + +/* Cheaper version of write_inode. All changes are concealed in + * aliases, which are moved back. No write to the medium happens. + */ +void logfs_clear_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct logfs_inode *li = logfs_inode(inode); + struct logfs_block *block = li->li_block; + struct page *page; + + /* Only deleted files may be dirty at this point */ + BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); + if (!block) + return; + if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { + block->ops->free_block(inode->i_sb, block); + return; + } + + BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS); + page = inode_to_page(inode); + BUG_ON(!page); /* FIXME: Use emergency page */ + logfs_put_write_page(page); +} + +static int do_write_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct inode *master_inode = logfs_super(sb)->s_master_inode; + loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; + struct page *page; + int err; + + BUG_ON(inode->i_ino == LOGFS_INO_MASTER); + /* FIXME: lock inode */ + + if (i_size_read(master_inode) < size) + i_size_write(master_inode, size); + + /* TODO: Tell vfs this inode is clean now */ + + page = inode_to_page(inode); + if (!page) + return -ENOMEM; + + /* FIXME: transaction is part of logfs_block now. Is that enough? */ + err = logfs_write_buf(master_inode, page, 0); + logfs_put_write_page(page); + return err; +} + +static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, + int write, + void (*change_se)(struct logfs_segment_entry *, long), + long arg) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + struct page *page; + struct logfs_segment_entry *se; + pgoff_t page_no; + int child_no; + + page_no = segno >> (sb->s_blocksize_bits - 3); + child_no = segno & ((sb->s_blocksize >> 3) - 1); + + inode = super->s_segfile_inode; + page = logfs_get_write_page(inode, page_no, 0); + BUG_ON(!page); /* FIXME: We need some reserve page for this case */ + if (!PageUptodate(page)) + logfs_read_block(inode, page, WRITE); + + if (write) + alloc_indirect_block(inode, page, 0); + se = kmap_atomic(page, KM_USER0); + change_se(se + child_no, arg); + if (write) { + logfs_set_alias(sb, logfs_block(page), child_no); + BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); + } + kunmap_atomic(se, KM_USER0); + + logfs_put_write_page(page); +} + +static void __get_segment_entry(struct logfs_segment_entry *se, long _target) +{ + struct logfs_segment_entry *target = (void *)_target; + + *target = *se; +} + +void logfs_get_segment_entry(struct super_block *sb, u32 segno, + struct logfs_segment_entry *se) +{ + logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); +} + +static void __set_segment_used(struct logfs_segment_entry *se, long increment) +{ + u32 valid; + + valid = be32_to_cpu(se->valid); + valid += increment; + se->valid = cpu_to_be32(valid); +} + +void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) +{ + struct logfs_super *super = logfs_super(sb); + u32 segno = ofs >> super->s_segshift; + + if (!increment) + return; + + logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); +} + +static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) +{ + 
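+	/* Callers pass (ec << 4 | gc_level): erase count in the high bits,
+	 * gc level in the low four (see logfs_set_segment_erased below). */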
+	se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
+		gc_level_t gc_level)
+{
+	u32 ec_level = ec << 4 | (__force u8)gc_level;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
+}
+
+static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
+{
+	se->valid = cpu_to_be32(RESERVED);
+}
+
+void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
+{
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
+}
+
+static void __set_segment_unreserved(struct logfs_segment_entry *se,
+		long ec_level)
+{
+	se->valid = 0;
+	se->ec_level = cpu_to_be32(ec_level);
+}
+
+void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
+{
+	u32 ec_level = ec << 4;
+
+	logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
+			ec_level);
+}
+
+int __logfs_write_inode(struct inode *inode, long flags)
+{
+	struct super_block *sb = inode->i_sb;
+	int ret;
+
+	logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
+	ret = do_write_inode(inode);
+	logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
+	return ret;
+}
+
+static int do_delete_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct inode *master_inode = logfs_super(sb)->s_master_inode;
+	struct page *page;
+	int ret;
+
+	page = logfs_get_write_page(master_inode, inode->i_ino, 0);
+	if (!page)
+		return -ENOMEM;
+
+	move_inode_to_page(page, inode);
+
+	logfs_get_wblocks(sb, page, 1);
+	ret = __logfs_delete(master_inode, page);
+	logfs_put_wblocks(sb, page, 1);
+
+	logfs_put_write_page(page);
+	return ret;
+}
+
+/*
+ * ZOMBIE inodes have already been deleted before and would remain dead,
+ * if it weren't for the validity checking. No need to kill them again here.
+ */
+void logfs_delete_inode(struct inode *inode)
+{
+	struct logfs_inode *li = logfs_inode(inode);
+
+	if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
+		li->li_flags |= LOGFS_IF_ZOMBIE;
+		if (i_size_read(inode) > 0)
+			logfs_truncate(inode, 0);
+		do_delete_inode(inode);
+	}
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+void btree_write_block(struct logfs_block *block)
+{
+	struct inode *inode;
+	struct page *page;
+	int err, cookie;
+
+	inode = logfs_safe_iget(block->sb, block->ino, &cookie);
+	page = logfs_get_write_page(inode, block->bix, block->level);
+
+	err = logfs_readpage_nolock(page);
+	BUG_ON(err);
+	BUG_ON(!PagePrivate(page));
+	BUG_ON(logfs_block(page) != block);
+	err = __logfs_write_buf(inode, page, 0);
+	BUG_ON(err);
+	BUG_ON(PagePrivate(page) || page->private);
+
+	logfs_put_write_page(page);
+	logfs_safe_iput(inode, cookie);
+}
+
+/**
+ * logfs_inode_write - write inode or dentry objects
+ *
+ * @inode: parent inode (ifile or directory)
+ * @buf: object to write (inode or dentry)
+ * @count: object size
+ * @bix: object number (file position in blocks/objects)
+ * @flags: write flags; WF_LOCK if the write lock still needs to be taken
+ * @shadow_tree: shadow below this inode
+ *
+ * FIXME: All callers of this function put a 200-300 byte variable on the
+ * stack, only to call here and do a memcpy from that stack variable. A
+ * good example of wasted performance and stack space.
+ */ +int logfs_inode_write(struct inode *inode, const void *buf, size_t count, + loff_t bix, long flags, struct shadow_tree *shadow_tree) +{ + loff_t pos = bix << inode->i_sb->s_blocksize_bits; + int err; + struct page *page; + void *pagebuf; + + BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); + BUG_ON(count > LOGFS_BLOCKSIZE); + page = logfs_get_write_page(inode, bix, 0); + if (!page) + return -ENOMEM; + + pagebuf = kmap_atomic(page, KM_USER0); + memcpy(pagebuf, buf, count); + flush_dcache_page(page); + kunmap_atomic(pagebuf, KM_USER0); + + if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) + i_size_write(inode, pos + LOGFS_BLOCKSIZE); + + err = logfs_write_buf(inode, page, flags); + logfs_put_write_page(page); + return err; +} + +int logfs_open_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *inode; + + inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_segfile_inode = inode; + return 0; +} + +int logfs_init_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int min_fill = 3 * super->s_no_blocks; + + INIT_LIST_HEAD(&super->s_object_alias); + mutex_init(&super->s_write_mutex); + super->s_block_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_block)); + super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, + sizeof(struct logfs_shadow)); + return 0; +} + +void logfs_cleanup_rw(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + destroy_meta_inode(super->s_segfile_inode); + if (super->s_block_pool) + mempool_destroy(super->s_block_pool); + if (super->s_shadow_pool) + mempool_destroy(super->s_shadow_pool); +} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c new file mode 100644 index 00000000000..1a14f9910d5 --- /dev/null +++ b/fs/logfs/segment.c @@ -0,0 +1,927 @@ +/* + * fs/logfs/segment.c - Handling the Object Store + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * Object store or ostore makes up the complete device with exception of + * the superblock and journal areas. Apart from its own metadata it stores + * three kinds of objects: inodes, dentries and blocks, both data and indirect. 
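+ *
+ * A sketch of the on-medium layout of a single object, as assembled by
+ * __logfs_segment_write() below:
+ *
+ *	+------------------------------+-------------------------------+
+ *	| logfs_object_header          | payload (len bytes, possibly  |
+ *	| (len, type, compr, ino, bix, | zlib-compressed)              |
+ *	|  crc, data_crc)              |                               |
+ *	+------------------------------+-------------------------------+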
+ */
+#include "logfs.h"
+
+static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct btree_head32 *head = &super->s_reserved_segments;
+	int err;
+
+	err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
+	if (err)
+		return err;
+	logfs_super(sb)->s_bad_segments++;
+	/* FIXME: write to journal */
+	return 0;
+}
+
+int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
+{
+	struct logfs_super *super = logfs_super(sb);
+
+	super->s_gec++;
+
+	return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
+			super->s_segsize, ensure_erase);
+}
+
+static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
+{
+	s32 ofs;
+
+	logfs_open_area(area, bytes);
+
+	ofs = area->a_used_bytes;
+	area->a_used_bytes += bytes;
+	BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
+
+	return dev_ofs(area->a_sb, area->a_segno, ofs);
+}
+
+static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
+		int use_filler)
+{
+	struct logfs_super *super = logfs_super(sb);
+	struct address_space *mapping = super->s_mapping_inode->i_mapping;
+	filler_t *filler = super->s_devops->readpage;
+	struct page *page;
+
+	BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+	if (use_filler)
+		page = read_cache_page(mapping, index, filler, sb);
+	else {
+		page = find_or_create_page(mapping, index, GFP_NOFS);
+		if (page)
+			unlock_page(page);
+	}
+	return page;
+}
+
+void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+		int use_filler)
+{
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	struct page *page;
+	long offset = ofs & (PAGE_SIZE-1);
+	long copylen;
+
+	/* Only logfs_wbuf_recover may use len==0 */
+	BUG_ON(!len && !use_filler);
+	do {
+		copylen = min((ulong)len, PAGE_SIZE - offset);
+
+		page = get_mapping_page(area->a_sb, index, use_filler);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
+		memcpy(page_address(page) + offset, buf, copylen);
+		SetPagePrivate(page);
+		page_cache_release(page);
+
+		buf += copylen;
+		len -= copylen;
+		offset = 0;
+		index++;
+	} while (len);
+}
+
+/*
+ * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	struct page *page;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	pgoff_t index = ofs >> PAGE_SHIFT;
+	long offset = ofs & (PAGE_SIZE-1);
+	u32 len = PAGE_SIZE - offset;
+
+	if (len == PAGE_SIZE) {
+		/* The math in this function can surely use some love */
+		len = 0;
+	}
+	if (len) {
+		BUG_ON(area->a_used_bytes >= super->s_segsize);
+
+		page = get_mapping_page(area->a_sb, index, 0);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		memset(page_address(page) + offset, 0xff, len);
+		SetPagePrivate(page);
+		page_cache_release(page);
+	}
+
+	if (!final)
+		return;
+
+	area->a_used_bytes += len;
+	for ( ; area->a_used_bytes < super->s_segsize;
+			area->a_used_bytes += PAGE_SIZE) {
+		/* Memset another page */
+		index++;
+		page = get_mapping_page(area->a_sb, index, 0);
+		BUG_ON(!page); /* FIXME: reserve a pool */
+		memset(page_address(page), 0xff, PAGE_SIZE);
+		SetPagePrivate(page);
+		page_cache_release(page);
+	}
+}
+
+/*
+ * We have to be careful with the alias tree. Since lookup is done by bix,
+ * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
+ * indirect blocks. So always use it through accessor functions.
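+ * The accessors below key the btree by (ino, logfs_pack_index(bix, level)),
+ * so a bix is always stored together with its level and the normalization
+ * only has to happen in one place.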
+ */ +static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, + level_t level) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_lookup128(head, ino, index); +} + +static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, + level_t level, void *val) +{ + struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; + pgoff_t index = logfs_pack_index(bix, level); + + return btree_insert128(head, ino, index, val, GFP_NOFS); +} + +static int btree_write_alias(struct super_block *sb, struct logfs_block *block, + write_alias_t *write_one_alias) +{ + struct object_alias_item *item; + int err; + + list_for_each_entry(item, &block->item_list, list) { + err = write_alias_journal(sb, block->ino, block->bix, + block->level, item->child_no, item->val); + if (err) + return err; + } + return 0; +} + +static gc_level_t btree_block_level(struct logfs_block *block) +{ + return expand_level(block->ino, block->level); +} + +static struct logfs_block_ops btree_block_ops = { + .write_block = btree_write_block, + .block_level = btree_block_level, + .free_block = __free_block, + .write_alias = btree_write_alias, +}; + +int logfs_load_object_aliases(struct super_block *sb, + struct logfs_obj_alias *oa, int count) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_block *block; + struct object_alias_item *item; + u64 ino, bix; + level_t level; + int i, err; + + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + count /= sizeof(*oa); + for (i = 0; i < count; i++) { + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + if (!item) + return -ENOMEM; + memset(item, 0, sizeof(*item)); + + super->s_no_object_aliases++; + item->val = oa[i].val; + item->child_no = be16_to_cpu(oa[i].child_no); + + ino = be64_to_cpu(oa[i].ino); + bix = be64_to_cpu(oa[i].bix); + level = LEVEL(oa[i].level); + + log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", + ino, bix, level, item->child_no, + be64_to_cpu(item->val)); + block = alias_tree_lookup(sb, ino, bix, level); + if (!block) { + block = __alloc_block(sb, ino, bix, level); + block->ops = &btree_block_ops; + err = alias_tree_insert(sb, ino, bix, level, block); + BUG_ON(err); /* mempool empty */ + } + if (test_and_set_bit(item->child_no, block->alias_map)) { + printk(KERN_ERR"LogFS: Alias collision detected\n"); + return -EIO; + } + list_move_tail(&block->alias_list, &super->s_object_alias); + list_add(&item->list, &block->item_list); + } + return 0; +} + +static void kill_alias(void *_block, unsigned long ignore0, + u64 ignore1, u64 ignore2, size_t ignore3) +{ + struct logfs_block *block = _block; + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + + while (!list_empty(&block->item_list)) { + item = list_entry(block->item_list.next, typeof(*item), list); + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->ops->free_block(sb, block); +} + +static int obj_type(struct inode *inode, level_t level) +{ + if (level == 0) { + if (S_ISDIR(inode->i_mode)) + return OBJ_DENTRY; + if (inode->i_ino == LOGFS_INO_MASTER) + return OBJ_INODE; + } + return OBJ_BLOCK; +} + +static int obj_len(struct super_block *sb, int obj_type) +{ + switch (obj_type) { + case OBJ_DENTRY: + return sizeof(struct logfs_disk_dentry); + case OBJ_INODE: + return sizeof(struct logfs_disk_inode); + case OBJ_BLOCK: + return sb->s_blocksize; + default: + BUG(); + } +} + +static int 
__logfs_segment_write(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len, int compr) +{ + struct logfs_area *area; + struct super_block *sb = inode->i_sb; + s64 ofs; + struct logfs_object_header h; + int acc_len; + + if (shadow->gc_level == 0) + acc_len = len; + else + acc_len = obj_len(sb, type); + + area = get_area(sb, shadow->gc_level); + ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); + LOGFS_BUG_ON(ofs <= 0, sb); + /* + * Order is important. logfs_get_free_bytes(), by modifying the + * segment file, may modify the content of the very page we're about + * to write now. Which is fine, as long as the calculated crc and + * written data still match. So do the modifications _before_ + * calculating the crc. + */ + + h.len = cpu_to_be16(len); + h.type = type; + h.compr = compr; + h.ino = cpu_to_be64(inode->i_ino); + h.bix = cpu_to_be64(shadow->bix); + h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); + h.data_crc = logfs_crc32(buf, len, 0); + + logfs_buf_write(area, ofs, &h, sizeof(h)); + logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); + + shadow->new_ofs = ofs; + shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; + + return 0; +} + +static s64 logfs_segment_write_compress(struct inode *inode, void *buf, + struct logfs_shadow *shadow, int type, int len) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + ssize_t compr_len; + int ret; + + mutex_lock(&logfs_super(sb)->s_journal_mutex); + compr_len = logfs_compress(buf, compressor_buf, len, len); + + if (compr_len >= 0) { + ret = __logfs_segment_write(inode, compressor_buf, shadow, + type, compr_len, COMPR_ZLIB); + } else { + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + } + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + return ret; +} + +/** + * logfs_segment_write - write data block to object store + * @inode: inode containing data + * + * Returns an errno or zero. + */ +int logfs_segment_write(struct inode *inode, struct page *page, + struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + int do_compress, type, len; + int ret; + void *buf; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; + if (shadow->gc_level != 0) { + /* temporarily disable compression for indirect blocks */ + do_compress = 0; + } + + type = obj_type(inode, shrink_level(shadow->gc_level)); + len = obj_len(sb, type); + buf = kmap(page); + if (do_compress) + ret = logfs_segment_write_compress(inode, buf, shadow, type, + len); + else + ret = __logfs_segment_write(inode, buf, shadow, type, len, + COMPR_NONE); + kunmap(page); + + log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + /* this BUG_ON did catch a locking bug. 
useful */ + BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); + return ret; +} + +int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) +{ + pgoff_t index = ofs >> PAGE_SHIFT; + struct page *page; + long offset = ofs & (PAGE_SIZE-1); + long copylen; + + while (len) { + copylen = min((ulong)len, PAGE_SIZE - offset); + + page = get_mapping_page(sb, index, 1); + if (IS_ERR(page)) + return PTR_ERR(page); + memcpy(buf, page_address(page) + offset, copylen); + page_cache_release(page); + + buf += copylen; + len -= copylen; + offset = 0; + index++; + } + return 0; +} + +/* + * The "position" of indirect blocks is ambiguous. It can be the position + * of any data block somewhere behind this indirect block. So we need to + * normalize the positions through logfs_block_mask() before comparing. + */ +static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) +{ + return (pos1 & logfs_block_mask(sb, level)) != + (pos2 & logfs_block_mask(sb, level)); +} + +#if 0 +static int read_seg_header(struct super_block *sb, u64 ofs, + struct logfs_segment_header *sh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*sh), sh); + if (err) + return err; + crc = logfs_crc32(sh, sizeof(*sh), 4); + if (crc != sh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(sh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} +#endif + +static int read_obj_header(struct super_block *sb, u64 ofs, + struct logfs_object_header *oh) +{ + __be32 crc; + int err; + + err = wbuf_read(sb, ofs, sizeof(*oh), oh); + if (err) + return err; + crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); + if (crc != oh->crc) { + printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " + "got %x\n", ofs, be32_to_cpu(oh->crc), + be32_to_cpu(crc)); + return -EIO; + } + return 0; +} + +static void move_btree_to_page(struct inode *inode, struct page *page, + __be64 *data) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct btree_head128 *head = &super->s_object_alias_tree; + struct logfs_block *block; + struct object_alias_item *item, *next; + + if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) + return; + + block = btree_remove128(head, inode->i_ino, page->index); + if (!block) + return; + + log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + list_for_each_entry_safe(item, next, &block->item_list, list) { + data[item->child_no] = item->val; + list_del(&item->list); + mempool_free(item, super->s_alias_pool); + } + block->page = page; + SetPagePrivate(page); + page->private = (unsigned long)block; + block->ops = &indirect_block_ops; + initialize_block_counters(page, block, data, 0); +} + +/* + * This silences a false, yet annoying gcc warning. I hate it when my editor + * jumps into bitops.h each time I recompile this file. + * TODO: Complain to gcc folks about this and upgrade compiler. 
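
wbuf_read() above is a standard page-spanning copy: a partial head page, whole pages in the middle, a partial tail. The same loop in plain C, with a flat array standing in for the get_mapping_page()/page_cache_release() pages:

#include <stdio.h>
#include <string.h>

#define PG 16                   /* toy page size */

static char backing[8 * PG];    /* stands in for the mapping's pages */

static int wbuf_read_sim(unsigned long ofs, size_t len, char *buf)
{
        unsigned long index = ofs / PG;         /* ofs >> PAGE_SHIFT */
        unsigned long offset = ofs % PG;        /* ofs & (PAGE_SIZE-1) */

        while (len) {
                size_t copylen = len < PG - offset ? len : PG - offset;

                memcpy(buf, backing + index * PG + offset, copylen);
                buf += copylen;
                len -= copylen;
                offset = 0;     /* only the first page starts mid-page */
                index++;
        }
        return 0;
}

int main(void)
{
        char out[40];

        for (size_t i = 0; i < sizeof(backing); i++)
                backing[i] = 'a' + i % 26;
        wbuf_read_sim(10, 36, out);     /* crosses three page boundaries */
        printf("%.36s\n", out);
        return 0;
}
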
+ */ +static unsigned long fnb(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +void move_page_to_btree(struct page *page) +{ + struct logfs_block *block = logfs_block(page); + struct super_block *sb = block->sb; + struct logfs_super *super = logfs_super(sb); + struct object_alias_item *item; + unsigned long pos; + __be64 *child; + int err; + + if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { + block->ops->free_block(sb, block); + return; + } + log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", + block->ino, block->bix, block->level); + super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; + + for (pos = 0; ; pos++) { + pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); + if (pos >= LOGFS_BLOCK_FACTOR) + break; + + item = mempool_alloc(super->s_alias_pool, GFP_NOFS); + BUG_ON(!item); /* mempool empty */ + memset(item, 0, sizeof(*item)); + + child = kmap_atomic(page, KM_USER0); + item->val = child[pos]; + kunmap_atomic(child, KM_USER0); + item->child_no = pos; + list_add(&item->list, &block->item_list); + } + block->page = NULL; + ClearPagePrivate(page); + page->private = 0; + block->ops = &btree_block_ops; + err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, + block); + BUG_ON(err); /* mempool empty */ + ClearPageUptodate(page); +} + +static int __logfs_segment_read(struct inode *inode, void *buf, + u64 ofs, u64 bix, level_t level) +{ + struct super_block *sb = inode->i_sb; + void *compressor_buf = logfs_super(sb)->s_compressed_je; + struct logfs_object_header oh; + __be32 crc; + u16 len; + int err, block_len; + + block_len = obj_len(sb, obj_type(inode, level)); + err = read_obj_header(sb, ofs, &oh); + if (err) + goto out_err; + + err = -EIO; + if (be64_to_cpu(oh.ino) != inode->i_ino + || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { + printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " + "expected (%lx, %llx), got (%llx, %llx)\n", + ofs, inode->i_ino, bix, + be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); + goto out_err; + } + + len = be16_to_cpu(oh.len); + + switch (oh.compr) { + case COMPR_NONE: + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); + if (err) + goto out_err; + crc = logfs_crc32(buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: uncompressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + goto out_err; + } + break; + case COMPR_ZLIB: + mutex_lock(&logfs_super(sb)->s_journal_mutex); + err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, + compressor_buf); + if (err) { + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + crc = logfs_crc32(compressor_buf, len, 0); + if (crc != oh.data_crc) { + printk(KERN_ERR"LOGFS: compressed data crc error at " + "%llx: expected %x, got %x\n", ofs, + be32_to_cpu(oh.data_crc), + be32_to_cpu(crc)); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + goto out_err; + } + err = logfs_uncompress(compressor_buf, buf, len, block_len); + mutex_unlock(&logfs_super(sb)->s_journal_mutex); + if (err) { + printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); + goto out_err; + } + break; + default: + LOGFS_BUG(sb); + err = -EIO; + goto out_err; + } + return 0; + +out_err: + logfs_set_ro(sb); + printk(KERN_ERR"LOGFS: device is read-only now\n"); + LOGFS_BUG(sb); + return err; +} + +/** + * logfs_segment_read - read data block from object store + * @inode: inode containing data + * @buf: data buffer + * @ofs: physical data offset + * @bix: block 
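
move_page_to_btree() above walks only the set bits of alias_map, and the loop shape is worth seeing in isolation. find_next_bit_sim() is a plain-C stand-in for the fnb()/find_next_bit() wrapper:

#include <stdio.h>

#define NBITS 32        /* stands in for LOGFS_BLOCK_FACTOR */

static unsigned long find_next_bit_sim(unsigned long map, unsigned long size,
                                       unsigned long offset)
{
        for (; offset < size; offset++)
                if (map & (1UL << offset))
                        return offset;
        return size;    /* no further bit set */
}

int main(void)
{
        unsigned long map = (1UL << 3) | (1UL << 17) | (1UL << 29);
        unsigned long pos;

        /* same shape as the loop in move_page_to_btree() */
        for (pos = 0; ; pos++) {
                pos = find_next_bit_sim(map, NBITS, pos);
                if (pos >= NBITS)
                        break;
                printf("child slot %lu carries an alias\n", pos);
        }
        return 0;
}
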
index + * @level: block level + * + * Returns 0 on success or a negative errno. + */ +int logfs_segment_read(struct inode *inode, struct page *page, + u64 ofs, u64 bix, level_t level) +{ + int err; + void *buf; + + if (PageUptodate(page)) + return 0; + + ofs &= ~LOGFS_FULLY_POPULATED; + + buf = kmap(page); + err = __logfs_segment_read(inode, buf, ofs, bix, level); + if (!err) { + move_btree_to_page(inode, page, buf); + SetPageUptodate(page); + } + kunmap(page); + log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", + inode->i_ino, bix, level, ofs, err); + return err; +} + +int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) +{ + struct super_block *sb = inode->i_sb; + struct logfs_super *super = logfs_super(sb); + struct logfs_object_header h; + u16 len; + int err; + + super->s_flags |= LOGFS_SB_FLAG_DIRTY; + BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); + BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); + if (!shadow->old_ofs) + return 0; + + log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", + shadow->ino, shadow->bix, shadow->gc_level, + shadow->old_ofs, shadow->new_ofs, + shadow->old_len, shadow->new_len); + err = read_obj_header(sb, shadow->old_ofs, &h); + LOGFS_BUG_ON(err, sb); + LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); + LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), + shrink_level(shadow->gc_level)), sb); + + if (shadow->gc_level == 0) + len = be16_to_cpu(h.len); + else + len = obj_len(sb, h.type); + shadow->old_len = len + sizeof(h); + return 0; +} + +static void freeseg(struct super_block *sb, u32 segno) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + u64 ofs, start, end; + + start = dev_ofs(sb, segno, 0); + end = dev_ofs(sb, segno + 1, 0); + for (ofs = start; ofs < end; ofs += PAGE_SIZE) { + page = find_get_page(mapping, ofs >> PAGE_SHIFT); + if (!page) + continue; + ClearPagePrivate(page); + page_cache_release(page); + } +} + +int logfs_open_area(struct logfs_area *area, size_t bytes) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + int err, closed = 0; + + if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) + return 0; + + if (area->a_is_open) { + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = super->s_segsize - area->a_written_bytes; + + log_gc("logfs_close_area(%x)\n", area->a_segno); + pad_wbuf(area, 1); + super->s_devops->writeseg(area->a_sb, ofs, len); + freeseg(sb, area->a_segno); + closed = 1; + } + + area->a_used_bytes = 0; + area->a_written_bytes = 0; +again: + area->a_ops->get_free_segment(area); + area->a_ops->get_erase_count(area); + + log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); + err = area->a_ops->erase_segment(area); + if (err) { + printk(KERN_WARNING "LogFS: Error erasing segment %x\n", + area->a_segno); + logfs_mark_segment_bad(sb, area->a_segno); + goto again; + } + area->a_is_open = 1; + return closed; +} + +void logfs_sync_area(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); + u32 len = (area->a_used_bytes - area->a_written_bytes); + + if (super->s_writesize) + len &= ~(super->s_writesize - 1); + if (len == 0) + return; + pad_wbuf(area, 0); + super->s_devops->writeseg(sb, ofs, len); + area->a_written_bytes += len; +} + +void logfs_sync_segments(struct 
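
logfs_sync_area() above only flushes whole device write units: the length is rounded down to s_writesize, and the partial tail stays buffered until more data arrives or pad_wbuf() pads it out at segment close. The rounding in isolation:

#include <stdio.h>

/* round a buffered length down to whole device write units (power of two) */
static unsigned int align_down(unsigned int len, unsigned int writesize)
{
        return writesize ? len & ~(writesize - 1) : len;
}

int main(void)
{
        /* 300 buffered bytes on a device with 256-byte write units */
        printf("%u\n", align_down(300, 256));   /* 256: one full unit flushed */
        printf("%u\n", align_down(100, 256));   /* 0: tail stays buffered */
        return 0;
}
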
super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + logfs_sync_area(super->s_area[i]); +} + +/* + * Pick a free segment to be used for this area. Effectively takes a + * candidate from the free list (not really a candidate anymore). + */ +static void ostore_get_free_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + + if (super->s_free_list.count == 0) { + printk(KERN_ERR"LOGFS: ran out of free segments\n"); + LOGFS_BUG(sb); + } + + area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); +} + +static void ostore_get_erase_count(struct logfs_area *area) +{ + struct logfs_segment_entry se; + u32 ec_level; + + logfs_get_segment_entry(area->a_sb, area->a_segno, &se); + BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || + se.valid == cpu_to_be32(RESERVED)); + + ec_level = be32_to_cpu(se.ec_level); + area->a_erase_count = (ec_level >> 4) + 1; +} + +static int ostore_erase_segment(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_segment_header sh; + u64 ofs; + int err; + + err = logfs_erase_segment(sb, area->a_segno, 0); + if (err) + return err; + + sh.pad = 0; + sh.type = SEG_OSTORE; + sh.level = (__force u8)area->a_level; + sh.segno = cpu_to_be32(area->a_segno); + sh.ec = cpu_to_be32(area->a_erase_count); + sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + + logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, + area->a_level); + + ofs = dev_ofs(sb, area->a_segno, 0); + area->a_used_bytes = sizeof(sh); + logfs_buf_write(area, ofs, &sh, sizeof(sh)); + return 0; +} + +static const struct logfs_area_ops ostore_area_ops = { + .get_free_segment = ostore_get_free_segment, + .get_erase_count = ostore_get_erase_count, + .erase_segment = ostore_erase_segment, +}; + +static void free_area(struct logfs_area *area) +{ + if (area) + freeseg(area->a_sb, area->a_segno); + kfree(area); +} + +static struct logfs_area *alloc_area(struct super_block *sb) +{ + struct logfs_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + return NULL; + + area->a_sb = sb; + return area; +} + +static void map_invalidatepage(struct page *page, unsigned long l) +{ + BUG(); +} + +static int map_releasepage(struct page *page, gfp_t g) +{ + /* Don't release these pages */ + return 0; +} + +static const struct address_space_operations mapping_aops = { + .invalidatepage = map_invalidatepage, + .releasepage = map_releasepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +int logfs_init_mapping(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping; + struct inode *inode; + + inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); + if (IS_ERR(inode)) + return PTR_ERR(inode); + super->s_mapping_inode = inode; + mapping = inode->i_mapping; + mapping->a_ops = &mapping_aops; + /* Would it be possible to use __GFP_HIGHMEM as well? 
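
ostore_get_erase_count() below unpacks a combined ec_level word; judging from the ">> 4" in this diff (an inference, not the on-disk format spec), the erase count occupies the upper bits and the level the low nibble. A sketch of the packing:

#include <stdint.h>
#include <stdio.h>

static uint32_t pack_ec_level(uint32_t ec, uint32_t level)
{
        return (ec << 4) | (level & 0xf);       /* assumed layout */
}

int main(void)
{
        uint32_t ec_level = pack_ec_level(117, 3);

        printf("level %u, next erase count %u\n",
               (unsigned)(ec_level & 0xf),
               (unsigned)((ec_level >> 4) + 1));  /* as in ostore_get_erase_count() */
        return 0;
}
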
*/ + mapping_set_gfp_mask(mapping, GFP_NOFS); + return 0; +} + +int logfs_init_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i = -1; + + super->s_alias_pool = mempool_create_kmalloc_pool(600, + sizeof(struct object_alias_item)); + if (!super->s_alias_pool) + return -ENOMEM; + + super->s_journal_area = alloc_area(sb); + if (!super->s_journal_area) + goto err; + + for_each_area(i) { + super->s_area[i] = alloc_area(sb); + if (!super->s_area[i]) + goto err; + super->s_area[i]->a_level = GC_LEVEL(i); + super->s_area[i]->a_ops = &ostore_area_ops; + } + btree_init_mempool128(&super->s_object_alias_tree, + super->s_btree_pool); + return 0; + +err: + for (i--; i >= 0; i--) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + mempool_destroy(super->s_alias_pool); + return -ENOMEM; +} + +void logfs_cleanup_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); + destroy_meta_inode(super->s_mapping_inode); +} diff --git a/fs/logfs/super.c b/fs/logfs/super.c new file mode 100644 index 00000000000..c66beab78de --- /dev/null +++ b/fs/logfs/super.c @@ -0,0 +1,650 @@ +/* + * fs/logfs/super.c + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> + * + * Generally contains mount/umount code and also serves as a dump area for + * any functions that don't fit elsewhere and neither justify a file of their + * own. + */ +#include "logfs.h" +#include <linux/bio.h> +#include <linux/mtd/mtd.h> +#include <linux/statfs.h> +#include <linux/buffer_head.h> + +static DEFINE_MUTEX(emergency_mutex); +static struct page *emergency_page; + +struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + struct page *page; + int err; + + page = read_cache_page(mapping, index, filler, NULL); + if (page) + return page; + + /* No more pages available, switch to emergency page */ + printk(KERN_INFO"Logfs: Using emergency page\n"); + mutex_lock(&emergency_mutex); + err = filler(NULL, emergency_page); + if (err) { + mutex_unlock(&emergency_mutex); + printk(KERN_EMERG"Logfs: Error reading emergency page\n"); + return ERR_PTR(err); + } + return emergency_page; +} + +void emergency_read_end(struct page *page) +{ + if (page == emergency_page) + mutex_unlock(&emergency_mutex); + else + page_cache_release(page); +} + +static void dump_segfile(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_entry se; + u32 segno; + + for (segno = 0; segno < super->s_no_segs; segno++) { + logfs_get_segment_entry(sb, segno, &se); + printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + if (++segno < super->s_no_segs) { + logfs_get_segment_entry(sb, segno, &se); + printk(" %6x %8x", be32_to_cpu(se.ec_level), + be32_to_cpu(se.valid)); + } + printk("\n"); + } +} + +/* + * logfs_crash_dump - dump debug information to device + * + * The LogFS superblock only occupies part of a 
segment. This function will + * write as much debug information as it can gather into the spare space. + */ +void logfs_crash_dump(struct super_block *sb) +{ + dump_segfile(sb); +} + +/* + * TODO: move to lib/string.c + */ +/** + * memchr_inv - Find a character in an area of memory. + * @s: The memory area + * @c: The byte to search for + * @n: The size of the area. + * + * returns the address of the first character other than @c, or %NULL + * if the whole buffer contains just @c. + */ +void *memchr_inv(const void *s, int c, size_t n) +{ + const unsigned char *p = s; + while (n-- != 0) + if ((unsigned char)c != *p++) + return (void *)(p - 1); + + return NULL; +} + +/* + * FIXME: There should be a reserve for root, similar to ext2. + */ +int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) +{ + struct super_block *sb = dentry->d_sb; + struct logfs_super *super = logfs_super(sb); + + stats->f_type = LOGFS_MAGIC_U32; + stats->f_bsize = sb->s_blocksize; + stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; + stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; + stats->f_files = 0; + stats->f_ffree = 0; + stats->f_namelen = LOGFS_MAX_NAMELEN; + return 0; +} + +static int logfs_sb_set(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + + sb->s_fs_info = super; + sb->s_mtd = super->s_mtd; + sb->s_bdev = super->s_bdev; + return 0; +} + +static int logfs_sb_test(struct super_block *sb, void *_super) +{ + struct logfs_super *super = _super; + struct mtd_info *mtd = super->s_mtd; + + if (mtd && sb->s_mtd == mtd) + return 1; + if (super->s_bdev && sb->s_bdev == super->s_bdev) + return 1; + return 0; +} + +static void set_segment_header(struct logfs_segment_header *sh, u8 type, + u8 level, u32 segno, u32 ec) +{ + sh->pad = 0; + sh->type = type; + sh->level = level; + sh->segno = cpu_to_be32(segno); + sh->ec = cpu_to_be32(ec); + sh->gec = cpu_to_be64(segno); + sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); +} + +static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, + u32 segno, u32 ec) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_segment_header *sh = &ds->ds_sh; + int i; + + memset(ds, 0, sizeof(*ds)); + set_segment_header(sh, SEG_SUPER, 0, segno, ec); + + ds->ds_ifile_levels = super->s_ifile_levels; + ds->ds_iblock_levels = super->s_iblock_levels; + ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ + ds->ds_segment_shift = super->s_segshift; + ds->ds_block_shift = sb->s_blocksize_bits; + ds->ds_write_shift = super->s_writeshift; + ds->ds_filesystem_size = cpu_to_be64(super->s_size); + ds->ds_segment_size = cpu_to_be32(super->s_segsize); + ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); + ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); + ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); + ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); + ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); + ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); + ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); + journal_for_each(i) + ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); + ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); + ds->ds_crc = logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12); +} + +static int write_one_sb(struct super_block *sb, + struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) +{ + struct logfs_super 
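
memchr_inv() above (slated for lib/string.c per the TODO) inverts memchr(): it finds the first byte that is NOT @c. On flash the typical use is checking that a region is fully erased, i.e. all 0xFF. A usage sketch, with the same logic duplicated so it runs stand-alone:

#include <stdio.h>
#include <string.h>

/* same logic as the patch's memchr_inv() */
static void *memchr_inv(const void *s, int c, size_t n)
{
        const unsigned char *p = s;

        while (n-- != 0)
                if ((unsigned char)c != *p++)
                        return (void *)(p - 1);
        return NULL;
}

static int region_is_erased(const void *buf, size_t len)
{
        return memchr_inv(buf, 0xFF, len) == NULL;
}

int main(void)
{
        unsigned char erased[8], dirty[8];

        memset(erased, 0xFF, sizeof(erased));
        memset(dirty, 0xFF, sizeof(dirty));
        dirty[5] = 0x00;

        printf("erased region ok: %d\n", region_is_erased(erased, 8));  /* 1 */
        printf("dirty region ok:  %d\n", region_is_erased(dirty, 8));   /* 0 */
        return 0;
}
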
*super = logfs_super(sb); + struct logfs_disk_super *ds; + struct logfs_segment_entry se; + struct page *page; + u64 ofs; + u32 ec, segno; + int err; + + page = find_sb(sb, &ofs); + if (!page) + return -EIO; + ds = page_address(page); + segno = seg_no(sb, ofs); + logfs_get_segment_entry(sb, segno, &se); + ec = be32_to_cpu(se.ec_level) >> 4; + ec++; + logfs_set_segment_erased(sb, segno, ec, 0); + logfs_write_ds(sb, ds, segno, ec); + err = super->s_devops->write_sb(sb, page); + page_cache_release(page); + return err; +} + +int logfs_write_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int err; + + /* First superblock */ + err = write_one_sb(sb, super->s_devops->find_first_sb); + if (err) + return err; + + /* Last superblock */ + err = write_one_sb(sb, super->s_devops->find_last_sb); + if (err) + return err; + return 0; +} + +static int ds_cmp(const void *ds0, const void *ds1) +{ + size_t len = sizeof(struct logfs_disk_super); + + /* We know the segment headers differ, so ignore them */ + len -= LOGFS_SEGMENT_HEADERSIZE; + ds0 += LOGFS_SEGMENT_HEADERSIZE; + ds1 += LOGFS_SEGMENT_HEADERSIZE; + return memcmp(ds0, ds1, len); +} + +static int logfs_recover_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct logfs_disk_super _ds0, *ds0 = &_ds0; + struct logfs_disk_super _ds1, *ds1 = &_ds1; + int err, valid0, valid1; + + /* read first superblock */ + err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); + if (err) + return err; + /* read last superblock */ + err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); + if (err) + return err; + valid0 = logfs_check_ds(ds0) == 0; + valid1 = logfs_check_ds(ds1) == 0; + + if (!valid0 && valid1) { + printk(KERN_INFO"First superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_first_sb); + } + if (valid0 && !valid1) { + printk(KERN_INFO"Last superblock is invalid - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + if (valid0 && valid1 && ds_cmp(ds0, ds1)) { + printk(KERN_INFO"Superblocks don't match - fixing.\n"); + return write_one_sb(sb, super->s_devops->find_last_sb); + } + /* If neither is valid now, something's wrong. Didn't we properly + * check them before?!? 
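
logfs_recover_sb() here chooses which superblock copy to rewrite, and the policy reads most clearly as a table: repair whichever copy is stale, and when both are valid but disagree, the first copy wins and the last is rewritten, matching the write order in logfs_write_sb() above. The decision logic in isolation:

#include <stdio.h>

enum fix { FIX_NONE, FIX_FIRST, FIX_LAST };

static enum fix recover_policy(int valid0, int valid1, int differ)
{
        if (!valid0 && valid1)
                return FIX_FIRST;       /* first copy is stale */
        if (valid0 && !valid1)
                return FIX_LAST;        /* last copy is stale */
        if (valid0 && valid1 && differ)
                return FIX_LAST;        /* first copy wins on disagreement */
        return FIX_NONE;                /* both invalid is a BUG() upstream */
}

int main(void)
{
        printf("%d %d %d\n",
               recover_policy(0, 1, 1),         /* FIX_FIRST */
               recover_policy(1, 0, 1),         /* FIX_LAST  */
               recover_policy(1, 1, 1));        /* FIX_LAST  */
        return 0;
}
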
*/ + BUG_ON(!valid0 && !valid1); + return 0; +} + +static int logfs_make_writeable(struct super_block *sb) +{ + int err; + + /* Repair any broken superblock copies */ + err = logfs_recover_sb(sb); + if (err) + return err; + + /* Check areas for trailing unaccounted data */ + err = logfs_check_areas(sb); + if (err) + return err; + + err = logfs_open_segfile(sb); + if (err) + return err; + + /* Do one GC pass before any data gets dirtied */ + logfs_gc_pass(sb); + + /* after all initializations are done, replay the journal + * for rw-mounts, if necessary */ + err = logfs_replay_journal(sb); + if (err) + return err; + + return 0; +} + +static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) +{ + struct logfs_super *super = logfs_super(sb); + struct inode *rootdir; + int err; + + /* root dir */ + rootdir = logfs_iget(sb, LOGFS_INO_ROOT); + if (IS_ERR(rootdir)) + goto fail; + + sb->s_root = d_alloc_root(rootdir); + if (!sb->s_root) + goto fail; + + super->s_erase_page = alloc_pages(GFP_KERNEL, 0); + if (!super->s_erase_page) + goto fail2; + memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); + + /* FIXME: check for read-only mounts */ + err = logfs_make_writeable(sb); + if (err) + goto fail3; + + log_super("LogFS: Finished mounting\n"); + simple_set_mnt(mnt, sb); + return 0; + +fail3: + __free_page(super->s_erase_page); +fail2: + iput(rootdir); +fail: + iput(logfs_super(sb)->s_master_inode); + return -EIO; +} + +int logfs_check_ds(struct logfs_disk_super *ds) +{ + struct logfs_segment_header *sh = &ds->ds_sh; + + if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) + return -EINVAL; + if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) + return -EINVAL; + if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), + LOGFS_SEGMENT_HEADERSIZE + 12)) + return -EINVAL; + return 0; +} + +static struct page *find_super_block(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *first, *last; + + first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); + if (!first || IS_ERR(first)) + return NULL; + last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); + if (!last || IS_ERR(first)) { + page_cache_release(first); + return NULL; + } + + if (!logfs_check_ds(page_address(first))) { + page_cache_release(last); + return first; + } + + /* First one didn't work, try the second superblock */ + if (!logfs_check_ds(page_address(last))) { + page_cache_release(first); + return last; + } + + /* Neither worked, sorry folks */ + page_cache_release(first); + page_cache_release(last); + return NULL; +} + +static int __logfs_read_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + struct page *page; + struct logfs_disk_super *ds; + int i; + + page = find_super_block(sb); + if (!page) + return -EIO; + + ds = page_address(page); + super->s_size = be64_to_cpu(ds->ds_filesystem_size); + super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); + super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); + super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); + super->s_segsize = 1 << ds->ds_segment_shift; + super->s_segmask = (1 << ds->ds_segment_shift) - 1; + super->s_segshift = ds->ds_segment_shift; + sb->s_blocksize = 1 << ds->ds_block_shift; + sb->s_blocksize_bits = ds->ds_block_shift; + super->s_writesize = 1 << ds->ds_write_shift; + super->s_writeshift = ds->ds_write_shift; + super->s_no_segs = super->s_size >> super->s_segshift; + super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; + 
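
__logfs_read_sb() stores every geometry value as a power-of-two shift, so each derived field costs one shift or mask. The arithmetic with concrete (illustrative) numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t fs_size = 1ULL << 30;          /* 1 GiB, for illustration */
        unsigned segshift = 17, blockshift = 12;

        uint64_t segsize = 1ULL << segshift;    /* 128 KiB segments */
        uint64_t segmask = segsize - 1;
        uint64_t no_segs = fs_size >> segshift;
        uint64_t no_blocks = segsize >> blockshift;

        printf("segsize=%llu mask=%llx segs=%llu blocks/seg=%llu\n",
               (unsigned long long)segsize, (unsigned long long)segmask,
               (unsigned long long)no_segs, (unsigned long long)no_blocks);
        return 0;
}
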
super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); + super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); + super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); + super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); + + journal_for_each(i) + super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); + + super->s_ifile_levels = ds->ds_ifile_levels; + super->s_iblock_levels = ds->ds_iblock_levels; + super->s_data_levels = ds->ds_data_levels; + super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels + + super->s_data_levels; + page_cache_release(page); + return 0; +} + +static int logfs_read_sb(struct super_block *sb, int read_only) +{ + struct logfs_super *super = logfs_super(sb); + int ret; + + super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); + if (!super->s_btree_pool) + return -ENOMEM; + + btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); + btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + + ret = logfs_init_mapping(sb); + if (ret) + return ret; + + ret = __logfs_read_sb(sb); + if (ret) + return ret; + + if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT) + return -EIO; + if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) && + !read_only) + return -EIO; + + mutex_init(&super->s_dirop_mutex); + mutex_init(&super->s_object_alias_mutex); + INIT_LIST_HEAD(&super->s_freeing_list); + + ret = logfs_init_rw(sb); + if (ret) + return ret; + + ret = logfs_init_areas(sb); + if (ret) + return ret; + + ret = logfs_init_gc(sb); + if (ret) + return ret; + + ret = logfs_init_journal(sb); + if (ret) + return ret; + + return 0; +} + +static void logfs_kill_sb(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + + log_super("LogFS: Start unmounting\n"); + /* Alias entries slow down mount, so evict as many as possible */ + sync_filesystem(sb); + logfs_write_anchor(sb); + + /* + * From this point on alias entries are simply dropped - and any + * writes to the object store are considered bugs. + */ + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; + log_super("LogFS: Now in shutdown\n"); + generic_shutdown_super(sb); + + BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); + + logfs_cleanup_gc(sb); + logfs_cleanup_journal(sb); + logfs_cleanup_areas(sb); + logfs_cleanup_rw(sb); + if (super->s_erase_page) + __free_page(super->s_erase_page); + super->s_devops->put_device(sb); + mempool_destroy(super->s_btree_pool); + mempool_destroy(super->s_alias_pool); + kfree(super); + log_super("LogFS: Finished unmounting\n"); +} + +int logfs_get_sb_device(struct file_system_type *type, int flags, + struct mtd_info *mtd, struct block_device *bdev, + const struct logfs_device_ops *devops, struct vfsmount *mnt) +{ + struct logfs_super *super; + struct super_block *sb; + int err = -ENOMEM; + static int mount_count; + + log_super("LogFS: Start mount %x\n", mount_count++); + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + goto err0; + + super->s_mtd = mtd; + super->s_bdev = bdev; + err = -EINVAL; + sb = sget(type, logfs_sb_test, logfs_sb_set, super); + if (IS_ERR(sb)) + goto err0; + + if (sb->s_root) { + /* Device is already in use */ + err = 0; + simple_set_mnt(mnt, sb); + goto err0; + } + + super->s_devops = devops; + + /* + * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache + * only covers 16TB and the upper 8TB are used for indirect blocks. 
+ * On 64bit system we could bump up the limit, but that would make + * the filesystem incompatible with 32bit systems. + */ + sb->s_maxbytes = (1ull << 43) - 1; + sb->s_op = &logfs_super_operations; + sb->s_flags = flags | MS_NOATIME; + + err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); + if (err) + goto err1; + + sb->s_flags |= MS_ACTIVE; + err = logfs_get_sb_final(sb, mnt); + if (err) + goto err1; + return 0; + +err1: + up_write(&sb->s_umount); + deactivate_super(sb); + return err; +err0: + kfree(super); + //devops->put_device(sb); + return err; +} + +static int logfs_get_sb(struct file_system_type *type, int flags, + const char *devname, void *data, struct vfsmount *mnt) +{ + ulong mtdnr; + + if (!devname) + return logfs_get_sb_bdev(type, flags, devname, mnt); + if (strncmp(devname, "mtd", 3)) + return logfs_get_sb_bdev(type, flags, devname, mnt); + + { + char *garbage; + mtdnr = simple_strtoul(devname+3, &garbage, 0); + if (*garbage) + return -EINVAL; + } + + return logfs_get_sb_mtd(type, flags, mtdnr, mnt); +} + +static struct file_system_type logfs_fs_type = { + .owner = THIS_MODULE, + .name = "logfs", + .get_sb = logfs_get_sb, + .kill_sb = logfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + +}; + +static int __init logfs_init(void) +{ + int ret; + + emergency_page = alloc_pages(GFP_KERNEL, 0); + if (!emergency_page) + return -ENOMEM; + + ret = logfs_compr_init(); + if (ret) + goto out1; + + ret = logfs_init_inode_cache(); + if (ret) + goto out2; + + return register_filesystem(&logfs_fs_type); +out2: + logfs_compr_exit(); +out1: + __free_pages(emergency_page, 0); + return ret; +} + +static void __exit logfs_exit(void) +{ + unregister_filesystem(&logfs_fs_type); + logfs_destroy_inode_cache(); + logfs_compr_exit(); + __free_pages(emergency_page, 0); +} + +module_init(logfs_init); +module_exit(logfs_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joern Engel <joern@logfs.org>"); +MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 74ea82d7216..756f8c93780 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -17,8 +17,10 @@ #include <linux/init.h> #include <linux/highuid.h> #include <linux/vfs.h> +#include <linux/writeback.h> -static int minix_write_inode(struct inode * inode, int wait); +static int minix_write_inode(struct inode *inode, + struct writeback_control *wbc); static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); static int minix_remount (struct super_block * sb, int * flags, char * data); @@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) return bh; } -static int minix_write_inode(struct inode *inode, int wait) +static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct buffer_head *bh; @@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait) bh = V2_minix_update_inode(inode); if (!bh) return -EIO; - if (wait && buffer_dirty(bh)) { + if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { printk("IO error syncing minix inode [%s:%08lx]\n", diff --git a/fs/mpage.c b/fs/mpage.c index 42381bd6543..598d54e200e 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -561,7 +561,7 @@ page_is_mapped: if (page->index >= end_index) { /* * The page straddles i_size. It must be zeroed out on each - * and every writepage invokation because it may be mmapped. + * and every writepage invocation because it may be mmapped. 
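
The 8TB figure above is straightforward arithmetic: a 32-bit page index times 4KiB pages spans 2^44 bytes (16TB), and reserving the upper half for indirect blocks leaves 2^43 - 1 bytes for file data, which is exactly the s_maxbytes value being set:

#include <stdio.h>

int main(void)
{
        unsigned long long cache = (1ULL << 32) * 4096;     /* 2^44: 16TB */
        unsigned long long maxbytes = (1ULL << 43) - 1;     /* lower half */

        printf("32-bit page cache span: %llu\n", cache);
        printf("s_maxbytes:             %llu\n", maxbytes);
        return 0;
}
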
* "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not diff --git a/fs/namei.c b/fs/namei.c index 0741c69b331..1c0fca6e899 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -19,7 +19,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/namei.h> -#include <linux/quotaops.h> #include <linux/pagemap.h> #include <linux/fsnotify.h> #include <linux/personality.h> @@ -498,8 +497,6 @@ static int link_path_walk(const char *, struct nameidata *); static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { - int res = 0; - char *name; if (IS_ERR(link)) goto fail; @@ -510,22 +507,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_get(&nd->root); } - res = link_path_walk(link, nd); - if (nd->depth || res || nd->last_type!=LAST_NORM) - return res; - /* - * If it is an iterative symlinks resolution in open_namei() we - * have to copy the last component. And all that crap because of - * bloody create() on broken symlinks. Furrfu... - */ - name = __getname(); - if (unlikely(!name)) { - path_put(&nd->path); - return -ENOMEM; - } - strcpy(name, nd->last.name); - nd->last.name = name; - return 0; + return link_path_walk(link, nd); fail: path_put(&nd->path); return PTR_ERR(link); @@ -547,10 +529,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd) nd->path.dentry = path->dentry; } -static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd) +static __always_inline int +__do_follow_link(struct path *path, struct nameidata *nd, void **p) { int error; - void *cookie; struct dentry *dentry = path->dentry; touch_atime(path->mnt, dentry); @@ -562,9 +544,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata } mntget(path->mnt); nd->last_type = LAST_BIND; - cookie = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(cookie); - if (!IS_ERR(cookie)) { + *p = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(*p); + if (!IS_ERR(*p)) { char *s = nd_get_link(nd); error = 0; if (s) @@ -574,8 +556,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata if (error) path_put(&nd->path); } - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, nd, cookie); } return error; } @@ -589,6 +569,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata */ static inline int do_follow_link(struct path *path, struct nameidata *nd) { + void *cookie; int err = -ELOOP; if (current->link_count >= MAX_NESTED_LINKS) goto loop; @@ -602,7 +583,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd) current->link_count++; current->total_link_count++; nd->depth++; - err = __do_follow_link(path, nd); + err = __do_follow_link(path, nd, &cookie); + if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) + path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); path_put(path); current->link_count--; nd->depth--; @@ -1375,22 +1358,6 @@ static inline int may_create(struct inode *dir, struct dentry *child) return inode_permission(dir, MAY_WRITE | MAY_EXEC); } -/* - * O_DIRECTORY translates into forcing a directory lookup. 
- */ -static inline int lookup_flags(unsigned int f) -{ - unsigned long retval = LOOKUP_FOLLOW; - - if (f & O_NOFOLLOW) - retval &= ~LOOKUP_FOLLOW; - - if (f & O_DIRECTORY) - retval |= LOOKUP_DIRECTORY; - - return retval; -} - /* * p1 and p2 should be directories on the same fs. */ @@ -1448,7 +1415,6 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, error = security_inode_create(dir, dentry, mode); if (error) return error; - vfs_dq_init(dir); error = dir->i_op->create(dir, dentry, mode, nd); if (!error) fsnotify_create(dir, dentry); @@ -1590,129 +1556,132 @@ static int open_will_truncate(int flag, struct inode *inode) return (flag & O_TRUNC); } -/* - * Note that the low bits of the passed in "open_flag" - * are not the same as in the local variable "flag". See - * open_to_namei_flags() for more details. - */ -struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode) +static struct file *finish_open(struct nameidata *nd, + int open_flag, int acc_mode) { struct file *filp; - struct nameidata nd; - int error; - struct path path; - struct dentry *dir; - int count = 0; int will_truncate; - int flag = open_to_namei_flags(open_flag); - int force_reval = 0; + int error; + will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + } + error = may_open(&nd->path, acc_mode, open_flag); + if (error) { + if (will_truncate) + mnt_drop_write(nd->path.mnt); + goto exit; + } + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(&nd->path); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. + * It is now safe to drop the mnt write + * because the filp has had a write taken + * on its behalf. */ - if (open_flag & __O_SYNC) - open_flag |= O_DSYNC; - - if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(open_flag); + if (will_truncate) + mnt_drop_write(nd->path.mnt); + return filp; - /* O_TRUNC implies we need access checks for write permissions */ - if (flag & O_TRUNC) - acc_mode |= MAY_WRITE; +exit: + if (!IS_ERR(nd->intent.open.file)) + release_open_intent(nd); + path_put(&nd->path); + return ERR_PTR(error); +} - /* Allow the LSM permission hook to distinguish append - access from general write access. */ - if (flag & O_APPEND) - acc_mode |= MAY_APPEND; +static struct file *do_last(struct nameidata *nd, struct path *path, + int open_flag, int acc_mode, + int mode, const char *pathname, + int *want_dir) +{ + struct dentry *dir = nd->path.dentry; + struct file *filp; + int error = -EISDIR; - /* - * The simplest case - just a plain lookup. 
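
finish_open() above is careful about write-reference ordering: the mnt write is taken before may_open() can truncate, and dropped only once the struct file holds a write count of its own, so a rw->ro remount cannot slip in between. The bracket in miniature, with stub counters rather than the kernel API:

#include <stdio.h>

static int writers;                     /* stands in for the mnt write count */

static int mnt_want_write_sim(void)  { writers++; return 0; }
static void mnt_drop_write_sim(void) { writers--; }

static int open_with_truncate(int will_truncate)
{
        if (will_truncate && mnt_want_write_sim())
                return -1;
        /* ... may_open() truncates, nameidata_to_filp() takes the file's
         * own write reference, handle_truncate() runs ... */
        if (will_truncate)
                mnt_drop_write_sim();   /* filp now holds its own write ref */
        return 0;
}

int main(void)
{
        open_with_truncate(1);
        printf("outstanding mnt writers: %d\n", writers);       /* 0 */
        return 0;
}
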
- */ - if (!(flag & O_CREAT)) { - filp = get_empty_filp(); - - if (filp == NULL) - return ERR_PTR(-ENFILE); - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = 0; - error = do_path_lookup(dfd, pathname, - lookup_flags(flag)|LOOKUP_OPEN, &nd); - if (IS_ERR(nd.intent.open.file)) { - if (error == 0) { - error = PTR_ERR(nd.intent.open.file); - path_put(&nd.path); + switch (nd->last_type) { + case LAST_DOTDOT: + follow_dotdot(nd); + dir = nd->path.dentry; + if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { + if (!dir->d_op->d_revalidate(dir, nd)) { + error = -ESTALE; + goto exit; } - } else if (error) - release_open_intent(&nd); - if (error) - return ERR_PTR(error); + } + /* fallthrough */ + case LAST_DOT: + case LAST_ROOT: + if (open_flag & O_CREAT) + goto exit; + /* fallthrough */ + case LAST_BIND: + audit_inode(pathname, dir); goto ok; } - /* - * Create - we need to know the parent. - */ -reval: - error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); - if (error) - return ERR_PTR(error); - if (force_reval) - nd.flags |= LOOKUP_REVAL; - error = path_walk(pathname, &nd); - if (error) { - if (nd.root.mnt) - path_put(&nd.root); - return ERR_PTR(error); + /* trailing slashes? */ + if (nd->last.name[nd->last.len]) { + if (open_flag & O_CREAT) + goto exit; + *want_dir = 1; } - if (unlikely(!audit_dummy_context())) - audit_inode(pathname, nd.path.dentry); - /* - * We have the parent and last component. First of all, check - * that we are not asked to creat(2) an obvious directory - that - * will not do. - */ - error = -EISDIR; - if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) - goto exit_parent; + /* just plain open? */ + if (!(open_flag & O_CREAT)) { + error = do_lookup(nd, &nd->last, path); + if (error) + goto exit; + error = -ENOENT; + if (!path->dentry->d_inode) + goto exit_dput; + if (path->dentry->d_inode->i_op->follow_link) + return NULL; + error = -ENOTDIR; + if (*want_dir && !path->dentry->d_inode->i_op->lookup) + goto exit_dput; + path_to_nameidata(path, nd); + audit_inode(pathname, nd->path.dentry); + goto ok; + } - error = -ENFILE; - filp = get_empty_filp(); - if (filp == NULL) - goto exit_parent; - nd.intent.open.file = filp; - filp->f_flags = open_flag; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - dir = nd.path.dentry; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN; - if (flag & O_EXCL) - nd.flags |= LOOKUP_EXCL; + /* OK, it's O_CREAT */ mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(&nd); - path.mnt = nd.path.mnt; -do_last: - error = PTR_ERR(path.dentry); - if (IS_ERR(path.dentry)) { + path->dentry = lookup_hash(nd); + path->mnt = nd->path.mnt; + + error = PTR_ERR(path->dentry); + if (IS_ERR(path->dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } - if (IS_ERR(nd.intent.open.file)) { - error = PTR_ERR(nd.intent.open.file); + if (IS_ERR(nd->intent.open.file)) { + error = PTR_ERR(nd->intent.open.file); goto exit_mutex_unlock; } /* Negative dentry, just create the file */ - if (!path.dentry->d_inode) { + if (!path->dentry->d_inode) { /* * This write is needed to ensure that a * ro->rw transition does not occur between @@ -1720,18 +1689,16 @@ do_last: * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). 
*/ - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(nd->path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(&nd, &path, open_flag, mode); + error = __open_namei_create(nd, path, open_flag, mode); if (error) { - mnt_drop_write(nd.path.mnt); + mnt_drop_write(nd->path.mnt); goto exit; } - filp = nameidata_to_filp(&nd); - mnt_drop_write(nd.path.mnt); - if (nd.root.mnt) - path_put(&nd.root); + filp = nameidata_to_filp(nd); + mnt_drop_write(nd->path.mnt); if (!IS_ERR(filp)) { error = ima_file_check(filp, acc_mode); if (error) { @@ -1746,150 +1713,181 @@ do_last: * It already exists. */ mutex_unlock(&dir->d_inode->i_mutex); - audit_inode(pathname, path.dentry); + audit_inode(pathname, path->dentry); error = -EEXIST; - if (flag & O_EXCL) + if (open_flag & O_EXCL) goto exit_dput; - if (__follow_mount(&path)) { + if (__follow_mount(path)) { error = -ELOOP; - if (flag & O_NOFOLLOW) + if (open_flag & O_NOFOLLOW) goto exit_dput; } error = -ENOENT; - if (!path.dentry->d_inode) + if (!path->dentry->d_inode) goto exit_dput; - if (path.dentry->d_inode->i_op->follow_link) - goto do_link; - path_to_nameidata(&path, &nd); + if (path->dentry->d_inode->i_op->follow_link) + return NULL; + + path_to_nameidata(path, nd); error = -EISDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) + if (S_ISDIR(path->dentry->d_inode->i_mode)) goto exit; ok: + filp = finish_open(nd, open_flag, acc_mode); + return filp; + +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); +exit_dput: + path_put_conditional(path, nd); +exit: + if (!IS_ERR(nd->intent.open.file)) + release_open_intent(nd); + path_put(&nd->path); + return ERR_PTR(error); +} + +/* + * Note that the low bits of the passed in "open_flag" + * are not the same as in the local variable "flag". See + * open_to_namei_flags() for more details. + */ +struct file *do_filp_open(int dfd, const char *pathname, + int open_flag, int mode, int acc_mode) +{ + struct file *filp; + struct nameidata nd; + int error; + struct path path; + int count = 0; + int flag = open_to_namei_flags(open_flag); + int force_reval = 0; + int want_dir = open_flag & O_DIRECTORY; + + if (!(open_flag & O_CREAT)) + mode = 0; + /* - * Consider: - * 1. may_open() truncates a file - * 2. a rw->ro mount transition occurs - * 3. nameidata_to_filp() fails due to - * the ro mount. - * That would be inconsistent, and should - * be avoided. Taking this mnt write here - * ensures that (2) can not occur. + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. */ - will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit; - } - error = may_open(&nd.path, acc_mode, open_flag); + if (open_flag & __O_SYNC) + open_flag |= O_DSYNC; + + if (!acc_mode) + acc_mode = MAY_OPEN | ACC_MODE(open_flag); + + /* O_TRUNC implies we need access checks for write permissions */ + if (open_flag & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. 
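
The flag normalization above compresses to a few lines: O_SYNC is defined as __O_SYNC|O_DSYNC so only O_DSYNC ever needs checking, O_TRUNC demands write permission, and O_APPEND is reported separately so an LSM can tell append from general write access. A sketch with illustrative flag values and ACC_MODE table (not the kernel's definitions):

#include <stdio.h>

/* illustrative values only -- not the kernel's definitions */
#define MAY_WRITE   0x02
#define MAY_READ    0x04
#define MAY_APPEND  0x08
#define MAY_OPEN    0x20

#define O_ACCMODE   0x03
#define O_TRUNC     0x0200
#define O_APPEND    0x0400
#define O_DSYNC     0x1000
#define __O_SYNC    0x100000

static int setup_acc_mode(int *open_flag)
{
        /* ACC_MODE-style table indexed by the two access-mode bits */
        static const int table[4] = {
                MAY_READ, MAY_WRITE, MAY_READ | MAY_WRITE, MAY_READ | MAY_WRITE
        };
        int acc;

        if (*open_flag & __O_SYNC)      /* O_SYNC == __O_SYNC|O_DSYNC */
                *open_flag |= O_DSYNC;

        acc = MAY_OPEN | table[*open_flag & O_ACCMODE];
        if (*open_flag & O_TRUNC)       /* truncation needs write permission */
                acc |= MAY_WRITE;
        if (*open_flag & O_APPEND)      /* let an LSM tell append from write */
                acc |= MAY_APPEND;
        return acc;
}

int main(void)
{
        int flag = 0x01 | O_TRUNC;      /* O_WRONLY|O_TRUNC */

        printf("acc_mode = %#x\n", setup_acc_mode(&flag));
        return 0;
}
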
*/ + if (open_flag & O_APPEND) + acc_mode |= MAY_APPEND; + + /* find the parent */ +reval: + error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); + if (error) + return ERR_PTR(error); + if (force_reval) + nd.flags |= LOOKUP_REVAL; + + current->total_link_count = 0; + error = link_path_walk(pathname, &nd); if (error) { - if (will_truncate) - mnt_drop_write(nd.path.mnt); - goto exit; - } - filp = nameidata_to_filp(&nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } + filp = ERR_PTR(error); + goto out; } - if (!IS_ERR(filp)) { - if (acc_mode & MAY_WRITE) - vfs_dq_init(nd.path.dentry->d_inode); + if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT)) + audit_inode(pathname, nd.path.dentry); - if (will_truncate) { - error = handle_truncate(&nd.path); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. + * We have the parent and last component. */ - if (will_truncate) - mnt_drop_write(nd.path.mnt); + + error = -ENFILE; + filp = get_empty_filp(); + if (filp == NULL) + goto exit_parent; + nd.intent.open.file = filp; + filp->f_flags = open_flag; + nd.intent.open.flags = flag; + nd.intent.open.create_mode = mode; + nd.flags &= ~LOOKUP_PARENT; + nd.flags |= LOOKUP_OPEN; + if (open_flag & O_CREAT) { + nd.flags |= LOOKUP_CREATE; + if (open_flag & O_EXCL) + nd.flags |= LOOKUP_EXCL; + } + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); + while (unlikely(!filp)) { /* trailing symlink */ + struct path holder; + struct inode *inode = path.dentry->d_inode; + void *cookie; + error = -ELOOP; + /* S_ISDIR part is a temporary automount kludge */ + if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode)) + goto exit_dput; + if (count++ == 32) + goto exit_dput; + /* + * This is subtle. Instead of calling do_follow_link() we do + * the thing by hands. The reason is that this way we have zero + * link_count and path_walk() (called from ->follow_link) + * honoring LOOKUP_PARENT. After that we have the parent and + * last component, i.e. we are in the same situation as after + * the first path_walk(). Well, almost - if the last component + * is normal we get its copy stored in nd->last.name and we will + * have to putname() it when we are done. Procfs-like symlinks + * just set LAST_BIND. 
+ */ + nd.flags |= LOOKUP_PARENT; + error = security_inode_follow_link(path.dentry, &nd); + if (error) + goto exit_dput; + error = __do_follow_link(&path, &nd, &cookie); + if (unlikely(error)) { + /* nd.path had been dropped */ + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(path.dentry, &nd, cookie); + path_put(&path); + release_open_intent(&nd); + filp = ERR_PTR(error); + goto out; + } + holder = path; + nd.flags &= ~LOOKUP_PARENT; + filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir); + if (inode->i_op->put_link) + inode->i_op->put_link(holder.dentry, &nd, cookie); + path_put(&holder); + } +out: if (nd.root.mnt) path_put(&nd.root); + if (filp == ERR_PTR(-ESTALE) && !force_reval) { + force_reval = 1; + goto reval; + } return filp; -exit_mutex_unlock: - mutex_unlock(&dir->d_inode->i_mutex); exit_dput: path_put_conditional(&path, &nd); -exit: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); exit_parent: - if (nd.root.mnt) - path_put(&nd.root); path_put(&nd.path); - return ERR_PTR(error); - -do_link: - error = -ELOOP; - if (flag & O_NOFOLLOW) - goto exit_dput; - /* - * This is subtle. Instead of calling do_follow_link() we do the - * thing by hands. The reason is that this way we have zero link_count - * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT. - * After that we have the parent and last component, i.e. - * we are in the same situation as after the first path_walk(). - * Well, almost - if the last component is normal we get its copy - * stored in nd->last.name and we will have to putname() it when we - * are done. Procfs-like symlinks just set LAST_BIND. - */ - nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(path.dentry, &nd); - if (error) - goto exit_dput; - error = __do_follow_link(&path, &nd); - path_put(&path); - if (error) { - /* Does someone understand code flow here? Or it is only - * me so stupid? Anathema to whoever designed this non-sense - * with "intent.open". 
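
The while (unlikely(!filp)) loop above is the trailing-symlink case: do_last() returns NULL, the caller follows the link by hand and retries, giving up after 32 links. The control flow as a toy resolver over an in-memory table (not the VFS):

#include <stdio.h>
#include <string.h>

static const char *readlink_sim(const char *name)
{
        if (!strcmp(name, "a")) return "b";     /* a -> b -> target */
        if (!strcmp(name, "b")) return "target";
        return NULL;                            /* not a symlink */
}

static const char *open_last(const char *name)
{
        int count = 0;

        while (readlink_sim(name)) {            /* "trailing symlink" case */
                if (count++ == 32)
                        return NULL;            /* -ELOOP upstream */
                name = readlink_sim(name);      /* follow and retry do_last() */
        }
        return name;
}

int main(void)
{
        printf("resolved to: %s\n", open_last("a"));    /* target */
        return 0;
}
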
- */ - release_open_intent(&nd); - if (nd.root.mnt) - path_put(&nd.root); - if (error == -ESTALE && !force_reval) { - force_reval = 1; - goto reval; - } - return ERR_PTR(error); - } - nd.flags &= ~LOOKUP_PARENT; - if (nd.last_type == LAST_BIND) - goto ok; - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit; - if (nd.last.name[nd.last.len]) { - __putname(nd.last.name); - goto exit; - } - error = -ELOOP; - if (count++==32) { - __putname(nd.last.name); - goto exit; - } - dir = nd.path.dentry; - mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(&nd); - path.mnt = nd.path.mnt; - __putname(nd.last.name); - goto do_last; + filp = ERR_PTR(error); + goto out; } /** @@ -1983,7 +1981,6 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mknod(dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); @@ -2082,7 +2079,6 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->mkdir(dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); @@ -2168,8 +2164,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); dentry_unhash(dentry); if (d_mountpoint(dentry)) @@ -2255,8 +2249,6 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - vfs_dq_init(dir); - mutex_lock(&dentry->d_inode->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; @@ -2369,7 +2361,6 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - vfs_dq_init(dir); error = dir->i_op->symlink(dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); @@ -2453,7 +2444,6 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) @@ -2554,7 +2544,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * e) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_mutex on parents, which works but leads to some truely excessive + * ->i_mutex on parents, which works but leads to some truly excessive * locking]. 
*/ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -2654,9 +2644,6 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir->i_op->rename) return -EPERM; - vfs_dq_init(old_dir); - vfs_dq_init(new_dir); - old_name = fsnotify_oldname_init(old_dentry->d_name.name); if (is_dir) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 73ab220354d..36dfdae9512 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -118,7 +118,6 @@ nfs4_callback_up(struct svc_serv *serv) dprintk("NFS: Callback listener port = %u (af %u)\n", nfs_callback_tcpport, PF_INET); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ret = svc_create_xprt(serv, "tcp", PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { @@ -129,7 +128,6 @@ nfs4_callback_up(struct svc_serv *serv) ret = 0; else goto out_err; -#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ return svc_prepare_thread(serv, &serv->sv_pools[0]); diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index d4036be0b58..85a7cfd1b8d 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -119,6 +119,14 @@ struct cb_recallanyargs { }; extern unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy); + +struct cb_recallslotargs { + struct sockaddr *crsa_addr; + uint32_t crsa_target_max_slots; +}; +extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, + void *dummy); + #endif /* CONFIG_NFS_V4_1 */ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index defa9b4c470..84761b5bb8e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -143,44 +143,49 @@ int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const n * Return success if the sequenceID is one more than what we last saw on * this slot, accounting for wraparound. Increments the slot's sequence. * - * We don't yet implement a duplicate request cache, so at this time - * we will log replays, and process them as if we had not seen them before, - * but we don't bump the sequence in the slot. Not too worried about it, + * We don't yet implement a duplicate request cache, instead we set the + * back channel ca_maxresponsesize_cached to zero. This is OK for now * since we only currently implement idempotent callbacks anyway. * * We have a single slot backchannel at this time, so we don't bother * checking the used_slots bit array on the table. The lower layer guarantees * a single outstanding callback request at a time. */ -static int -validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) +static __be32 +validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) { struct nfs4_slot *slot; dprintk("%s enter. 
slotid %d seqid %d\n", - __func__, slotid, seqid); + __func__, args->csa_slotid, args->csa_sequenceid); - if (slotid > NFS41_BC_MAX_CALLBACKS) + if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) return htonl(NFS4ERR_BADSLOT); - slot = tbl->slots + slotid; + slot = tbl->slots + args->csa_slotid; dprintk("%s slot table seqid: %d\n", __func__, slot->seq_nr); /* Normal */ - if (likely(seqid == slot->seq_nr + 1)) { + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { slot->seq_nr++; return htonl(NFS4_OK); } /* Replay */ - if (seqid == slot->seq_nr) { - dprintk("%s seqid %d is a replay - no DRC available\n", - __func__, seqid); - return htonl(NFS4_OK); + if (args->csa_sequenceid == slot->seq_nr) { + dprintk("%s seqid %d is a replay\n", + __func__, args->csa_sequenceid); + /* Signal process_op to set this error on next op */ + if (args->csa_cachethis == 0) + return htonl(NFS4ERR_RETRY_UNCACHED_REP); + + /* The ca_maxresponsesize_cached is 0 with no DRC */ + else if (args->csa_cachethis == 1) + return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); } /* Wraparound */ - if (seqid == 1 && (slot->seq_nr + 1) == 0) { + if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { slot->seq_nr = 1; return htonl(NFS4_OK); } @@ -225,27 +230,87 @@ validate_seqid(struct nfs4_slot_table *tbl, u32 slotid, u32 seqid) return NULL; } -/* FIXME: referring calls should be processed */ -unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, +/* + * For each referring call triple, check the session's slot table for + * a match. If the slot is in use and the sequence numbers match, the + * client is still waiting for a response to the original request. + */ +static bool referring_call_exists(struct nfs_client *clp, + uint32_t nrclists, + struct referring_call_list *rclists) +{ + bool status = 0; + int i, j; + struct nfs4_session *session; + struct nfs4_slot_table *tbl; + struct referring_call_list *rclist; + struct referring_call *ref; + + /* + * XXX When client trunking is implemented, this becomes + * a session lookup from within the loop + */ + session = clp->cl_session; + tbl = &session->fc_slot_table; + + for (i = 0; i < nrclists; i++) { + rclist = &rclists[i]; + if (memcmp(session->sess_id.data, + rclist->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN) != 0) + continue; + + for (j = 0; j < rclist->rcl_nrefcalls; j++) { + ref = &rclist->rcl_refcalls[j]; + + dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " + "slotid %u\n", __func__, + ((u32 *)&rclist->rcl_sessionid.data)[0], + ((u32 *)&rclist->rcl_sessionid.data)[1], + ((u32 *)&rclist->rcl_sessionid.data)[2], + ((u32 *)&rclist->rcl_sessionid.data)[3], + ref->rc_sequenceid, ref->rc_slotid); + + spin_lock(&tbl->slot_tbl_lock); + status = (test_bit(ref->rc_slotid, tbl->used_slots) && + tbl->slots[ref->rc_slotid].seq_nr == + ref->rc_sequenceid); + spin_unlock(&tbl->slot_tbl_lock); + if (status) + goto out; + } + } + +out: + return status; +} + +__be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res) { struct nfs_client *clp; - int i, status; - - for (i = 0; i < args->csa_nrclists; i++) - kfree(args->csa_rclists[i].rcl_refcalls); - kfree(args->csa_rclists); + int i; + __be32 status; status = htonl(NFS4ERR_BADSESSION); clp = find_client_with_session(args->csa_addr, 4, &args->csa_sessionid); if (clp == NULL) goto out; - status = validate_seqid(&clp->cl_session->bc_slot_table, - args->csa_slotid, args->csa_sequenceid); + status = validate_seqid(&clp->cl_session->bc_slot_table, args); if (status) goto out_putclient; + /* + * Check for pending 
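
validate_seqid() above distinguishes three cases: the next sequence number advances the slot, an equal number is a replay (rejected one way or the other, since there is no duplicate request cache), and a sequence of 1 after a wrapped counter restarts the slot. The state machine in isolation, with illustrative return codes standing in for the NFS4ERR_* values:

#include <stdint.h>
#include <stdio.h>

enum { OK, RETRY_UNCACHED, REP_TOO_BIG, SEQ_MISORDERED };

static int validate_seqid_sim(uint32_t *slot_seq, uint32_t seqid, int cachethis)
{
        if (seqid == *slot_seq + 1) {           /* normal: bump the slot */
                (*slot_seq)++;
                return OK;
        }
        if (seqid == *slot_seq)                 /* replay, but no reply cache */
                return cachethis ? REP_TOO_BIG : RETRY_UNCACHED;
        if (seqid == 1 && *slot_seq + 1 == 0) { /* wraparound */
                *slot_seq = 1;
                return OK;
        }
        return SEQ_MISORDERED;
}

int main(void)
{
        uint32_t seq = 7;

        printf("%d ", validate_seqid_sim(&seq, 8, 0));  /* 0: OK, seq -> 8  */
        printf("%d ", validate_seqid_sim(&seq, 8, 0));  /* 1: uncached replay */
        seq = ~0u;                                      /* counter about to wrap */
        printf("%d\n", validate_seqid_sim(&seq, 1, 0)); /* 0: OK via wraparound */
        return 0;
}
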
referring calls. If a match is found, a + * related callback was received before the response to the original + * call. + */ + if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { + status = htonl(NFS4ERR_DELAY); + goto out_putclient; + } + memcpy(&res->csr_sessionid, &args->csa_sessionid, sizeof(res->csr_sessionid)); res->csr_sequenceid = args->csa_sequenceid; @@ -256,15 +321,23 @@ unsigned nfs4_callback_sequence(struct cb_sequenceargs *args, out_putclient: nfs_put_client(clp); out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - res->csr_status = status; - return res->csr_status; + for (i = 0; i < args->csa_nrclists; i++) + kfree(args->csa_rclists[i].rcl_refcalls); + kfree(args->csa_rclists); + + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) + res->csr_status = 0; + else + res->csr_status = status; + dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, + ntohl(status), ntohl(res->csr_status)); + return status; } -unsigned nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) +__be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) { struct nfs_client *clp; - int status; + __be32 status; fmode_t flags = 0; status = htonl(NFS4ERR_OP_NOT_IN_SESSION); @@ -289,4 +362,40 @@ out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } + +/* Reduce the fore channel's max_slots to the target value */ +__be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy) +{ + struct nfs_client *clp; + struct nfs4_slot_table *fc_tbl; + __be32 status; + + status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->crsa_addr, 4); + if (clp == NULL) + goto out; + + dprintk("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + args->crsa_target_max_slots); + + fc_tbl = &clp->cl_session->fc_slot_table; + + status = htonl(NFS4ERR_BAD_HIGH_SLOT); + if (args->crsa_target_max_slots > fc_tbl->max_slots || + args->crsa_target_max_slots < 1) + goto out_putclient; + + status = htonl(NFS4_OK); + if (args->crsa_target_max_slots == fc_tbl->max_slots) + goto out_putclient; + + fc_tbl->target_max_slots = args->crsa_target_max_slots; + nfs41_handle_recall_slot(clp); +out_putclient: + nfs_put_client(clp); /* balance nfs_find_client */ +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 8e1a2511c8b..a2b8b4df125 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -24,10 +24,14 @@ #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK +/* Internal error code */ +#define NFS4ERR_RESOURCE_HDR 11050 + typedef __be32 (*callback_process_op_t)(void *, void *); typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *); @@ -173,7 +177,7 @@ static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) __be32 *p; p = read_buf(xdr, 4); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); + return htonl(NFS4ERR_RESOURCE_HDR); *op = ntohl(*p); return 0; } @@ -215,10 +219,10 @@ out: #if defined(CONFIG_NFS_V4_1) -static unsigned decode_sessionid(struct xdr_stream 
*xdr, +static __be32 decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) { - uint32_t *p; + __be32 *p; int len = NFS4_MAX_SESSIONID_LEN; p = read_buf(xdr, len); @@ -229,12 +233,12 @@ static unsigned decode_sessionid(struct xdr_stream *xdr, return 0; } -static unsigned decode_rc_list(struct xdr_stream *xdr, +static __be32 decode_rc_list(struct xdr_stream *xdr, struct referring_call_list *rc_list) { - uint32_t *p; + __be32 *p; int i; - unsigned status; + __be32 status; status = decode_sessionid(xdr, &rc_list->rcl_sessionid); if (status) @@ -267,13 +271,13 @@ out: return status; } -static unsigned decode_cb_sequence_args(struct svc_rqst *rqstp, +static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_sequenceargs *args) { - uint32_t *p; + __be32 *p; int i; - unsigned status; + __be32 status; status = decode_sessionid(xdr, &args->csa_sessionid); if (status) @@ -327,11 +331,11 @@ out_free: goto out; } -static unsigned decode_recallany_args(struct svc_rqst *rqstp, +static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - uint32_t *p; + __be32 *p; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); @@ -346,6 +350,20 @@ static unsigned decode_recallany_args(struct svc_rqst *rqstp, return 0; } +static __be32 decode_recallslot_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_recallslotargs *args) +{ + __be32 *p; + + args->crsa_addr = svc_addr(rqstp); + p = read_buf(xdr, 4); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + args->crsa_target_max_slots = ntohl(*p++); + return 0; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -465,7 +483,7 @@ static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res) p = xdr_reserve_space(xdr, 8); if (unlikely(p == NULL)) - return htonl(NFS4ERR_RESOURCE); + return htonl(NFS4ERR_RESOURCE_HDR); *p++ = htonl(op); *p = res; return 0; @@ -499,10 +517,10 @@ out: #if defined(CONFIG_NFS_V4_1) -static unsigned encode_sessionid(struct xdr_stream *xdr, +static __be32 encode_sessionid(struct xdr_stream *xdr, const struct nfs4_sessionid *sid) { - uint32_t *p; + __be32 *p; int len = NFS4_MAX_SESSIONID_LEN; p = xdr_reserve_space(xdr, len); @@ -513,11 +531,11 @@ static unsigned encode_sessionid(struct xdr_stream *xdr, return 0; } -static unsigned encode_cb_sequence_res(struct svc_rqst *rqstp, +static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_sequenceres *res) { - uint32_t *p; + __be32 *p; unsigned status = res->csr_status; if (unlikely(status != 0)) @@ -554,6 +572,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_RECALL: case OP_CB_SEQUENCE: case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: *op = &callback_ops[op_nr]; break; @@ -562,7 +581,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: - case OP_CB_RECALL_SLOT: case OP_CB_WANTS_CANCELLED: case OP_CB_NOTIFY_LOCK: return htonl(NFS4ERR_NOTSUPP); @@ -602,20 +620,18 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) static __be32 process_op(uint32_t minorversion, int nop, struct svc_rqst *rqstp, struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp) + struct xdr_stream *xdr_out, void *resp, int* drc_status) { struct callback_op *op = &callback_ops[0]; - 
unsigned int op_nr = OP_CB_ILLEGAL; + unsigned int op_nr; __be32 status; long maxlen; __be32 res; dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); - if (unlikely(status)) { - status = htonl(NFS4ERR_OP_ILLEGAL); - goto out; - } + if (unlikely(status)) + return status; dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", __func__, minorversion, nop, op_nr); @@ -624,19 +640,32 @@ static __be32 process_op(uint32_t minorversion, int nop, preprocess_nfs4_op(op_nr, &op); if (status == htonl(NFS4ERR_OP_ILLEGAL)) op_nr = OP_CB_ILLEGAL; -out: + if (status) + goto encode_hdr; + + if (*drc_status) { + status = *drc_status; + goto encode_hdr; + } + maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { - if (likely(status == 0 && op->decode_args != NULL)) - status = op->decode_args(rqstp, xdr_in, argp); - if (likely(status == 0 && op->process_op != NULL)) + status = op->decode_args(rqstp, xdr_in, argp); + if (likely(status == 0)) status = op->process_op(argp, resp); } else status = htonl(NFS4ERR_RESOURCE); + /* Only set by OP_CB_SEQUENCE processing */ + if (status == htonl(NFS4ERR_RETRY_UNCACHED_REP)) { + *drc_status = status; + status = 0; + } + +encode_hdr: res = encode_op_hdr(xdr_out, op_nr, status); - if (status == 0) - status = res; + if (unlikely(res)) + return res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); dprintk("%s: done, status = %d\n", __func__, ntohl(status)); @@ -652,7 +681,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_compound_hdr_res hdr_res = { NULL }; struct xdr_stream xdr_in, xdr_out; __be32 *p; - __be32 status; + __be32 status, drc_status = 0; unsigned int nops = 0; dprintk("%s: start\n", __func__); @@ -672,11 +701,18 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r return rpc_system_err; while (status == 0 && nops != hdr_arg.nops) { - status = process_op(hdr_arg.minorversion, nops, - rqstp, &xdr_in, argp, &xdr_out, resp); + status = process_op(hdr_arg.minorversion, nops, rqstp, + &xdr_in, argp, &xdr_out, resp, &drc_status); nops++; } + /* Buffer overflow in decode_ops_hdr or encode_ops_hdr. 
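Aside on the drc_status plumbing above: once CB_SEQUENCE detects a replayed request that was never cached, process_op() records NFS4ERR_RETRY_UNCACHED_REP in the per-compound drc_status, reports success for the sequence op itself, and then answers every remaining op in the compound with the saved error instead of executing it. A minimal userspace model of the pattern (all names here are illustrative, not kernel API):

#include <stdio.h>

#define ERR_RETRY_UNCACHED 10068  /* stands in for NFS4ERR_RETRY_UNCACHED_REP */

/* Model op handler: op 0 plays CB_SEQUENCE and spots an uncached replay. */
static int do_op(int opnum, int is_replay)
{
    return (opnum == 0 && is_replay) ? ERR_RETRY_UNCACHED : 0;
}

static int process_one(int opnum, int is_replay, int *drc_status)
{
    int status;

    if (*drc_status)          /* error every op after the failed sequence */
        return *drc_status;

    status = do_op(opnum, is_replay);
    if (status == ERR_RETRY_UNCACHED) {
        *drc_status = status; /* remember for the remaining ops... */
        status = 0;           /* ...but CB_SEQUENCE itself reports success */
    }
    return status;
}

int main(void)
{
    int drc_status = 0, op;

    for (op = 0; op < 3; op++)
        printf("op %d -> %d\n", op, process_one(op, 1, &drc_status));
    return 0;
}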
Return + * resource error in cb_compound status without returning op */ + if (unlikely(status == htonl(NFS4ERR_RESOURCE_HDR))) { + status = htonl(NFS4ERR_RESOURCE); + nops--; + } + *hdr_res.status = status; *hdr_res.nops = htonl(nops); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); @@ -713,6 +749,11 @@ static struct callback_op callback_ops[] = { .decode_args = (callback_decode_arg_t)decode_recallany_args, .res_maxsize = CB_OP_RECALLANY_RES_MAXSZ, }, + [OP_CB_RECALL_SLOT] = { + .process_op = (callback_process_op_t)nfs4_callback_recallslot, + .decode_args = (callback_decode_arg_t)decode_recallslot_args, + .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ }; @@ -741,6 +782,7 @@ struct svc_version nfs4_callback_version1 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, + .vs_hidden = 1, }; struct svc_version nfs4_callback_version4 = { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee77713ce68..2274f173733 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -164,30 +164,7 @@ error_0: return ERR_PTR(err); } -static void nfs4_shutdown_client(struct nfs_client *clp) -{ -#ifdef CONFIG_NFS_V4 - if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) - nfs4_kill_renewd(clp); - BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners)); - if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) - nfs_idmap_delete(clp); - - rpc_destroy_wait_queue(&clp->cl_rpcwaitq); -#endif -} - -/* - * Destroy the NFS4 callback service - */ -static void nfs4_destroy_callback(struct nfs_client *clp) -{ #ifdef CONFIG_NFS_V4 - if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_minorversion); -#endif /* CONFIG_NFS_V4 */ -} - /* * Clears/puts all minor version specific parts from an nfs_client struct * reverting it to minorversion 0. 
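The NFS4ERR_RESOURCE_HDR value defined in callback_xdr.c above (11050) never goes on the wire: it is an internal sentinel meaning decode_op_hdr()/encode_op_hdr() ran out of buffer, and nfs4_callback_compound() translates it to the real NFS4ERR_RESOURCE while dropping the half-processed op from the reply's nops. A sketch of the translate-at-the-boundary idiom, under the assumption that only the compound loop ever sees the sentinel:

#include <arpa/inet.h>   /* htonl/ntohl */
#include <stdint.h>
#include <stdio.h>

#define NFS4ERR_RESOURCE      10018
#define NFS4ERR_RESOURCE_HDR  11050  /* internal only, never on the wire */

/* Illustrative end-of-compound step: translate the sentinel before replying. */
static uint32_t finish_compound(uint32_t status, unsigned int *nops)
{
    if (status == htonl(NFS4ERR_RESOURCE_HDR)) {
        status = htonl(NFS4ERR_RESOURCE);
        (*nops)--;            /* the failed op header was never emitted */
    }
    return status;
}

int main(void)
{
    unsigned int nops = 3;
    uint32_t status = finish_compound(htonl(NFS4ERR_RESOURCE_HDR), &nops);

    printf("wire status %u, nops %u\n", ntohl(status), nops);
    return 0;
}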
@@ -202,9 +179,33 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) clp->cl_call_sync = _nfs4_call_sync; #endif /* CONFIG_NFS_V4_1 */ +} +/* + * Destroy the NFS4 callback service + */ +static void nfs4_destroy_callback(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(clp->cl_minorversion); +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + nfs4_clear_client_minor_version(clp); nfs4_destroy_callback(clp); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); } +#else +static void nfs4_shutdown_client(struct nfs_client *clp) +{ +} +#endif /* CONFIG_NFS_V4 */ /* * Destroy a shared client record @@ -213,7 +214,6 @@ static void nfs_free_client(struct nfs_client *clp) { dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs4_clear_client_minor_version(clp); nfs4_shutdown_client(clp); nfs_fscache_release_client_cookie(clp); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 944b627ec6e..69e7b814012 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode) } #endif +static inline int nfs_have_delegated_attributes(struct inode *inode) +{ + return nfs_have_delegation(inode, FMODE_READ) && + !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); +} + #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3c7f03b669f..c6f2750648f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) desc->entry = &my_entry; nfs_block_sillyrename(dentry); - res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (res < 0) goto out; @@ -1789,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str cache = nfs_access_search_rbtree(inode, cred); if (cache == NULL) goto out; - if (!nfs_have_delegation(inode, FMODE_READ) && + if (!nfs_have_delegated_attributes(inode) && !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) goto out_stale; res->jiffies = cache->jiffies; diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 95e1ca765d4..3f0cd4dfdda 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -36,6 +36,19 @@ struct nfs_dns_ent { }; +static void nfs_dns_ent_update(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; +} + static void nfs_dns_ent_init(struct cache_head *cnew, struct cache_head *ckey) { @@ -49,8 +62,7 @@ static void nfs_dns_ent_init(struct cache_head *cnew, new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); if (new->hostname) { new->namelen = key->namelen; - memcpy(&new->addr, &key->addr, key->addrlen); - new->addrlen = key->addrlen; + nfs_dns_ent_update(cnew, ckey); } else { new->namelen = 0; new->addrlen = 0; @@ -234,7 +246,7 @@ static struct cache_detail nfs_dns_resolve = { .cache_show = nfs_dns_show, .match = nfs_dns_match, .init = nfs_dns_ent_init, - .update = nfs_dns_ent_init, + .update = nfs_dns_ent_update, .alloc = nfs_dns_ent_alloc, }; diff 
--git a/fs/nfs/file.c b/fs/nfs/file.c index 63f2071d644..ae8d02294e4 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -123,11 +123,11 @@ nfs_file_open(struct inode *inode, struct file *filp) filp->f_path.dentry->d_parent->d_name.name, filp->f_path.dentry->d_name.name); + nfs_inc_stats(inode, NFSIOS_VFSOPEN); res = nfs_check_flags(filp->f_flags); if (res) return res; - nfs_inc_stats(inode, NFSIOS_VFSOPEN); res = nfs_open(inode, filp); return res; } @@ -237,9 +237,9 @@ nfs_file_flush(struct file *file, fl_owner_t id) dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inc_stats(inode, NFSIOS_VFSFLUSH); if ((file->f_mode & FMODE_WRITE) == 0) return 0; - nfs_inc_stats(inode, NFSIOS_VFSFLUSH); /* Flush writes to the server and return any errors */ return nfs_do_fsync(ctx, inode); @@ -262,9 +262,11 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, (unsigned long) count, (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); - nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count); - if (!result) + if (!result) { result = generic_file_aio_read(iocb, iov, nr_segs, pos); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } return result; } @@ -282,8 +284,11 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, (unsigned long) count, (unsigned long long) *ppos); res = nfs_revalidate_mapping(inode, filp->f_mapping); - if (!res) + if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); + if (res > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); + } return res; } @@ -596,6 +601,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, { struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; + unsigned long written = 0; ssize_t result; size_t count = iov_length(iov, nr_segs); @@ -622,14 +628,18 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); result = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (result > 0) + written = result; + /* Return error values for O_DSYNC and IS_SYNC() */ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); if (err < 0) result = err; } + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); out: return result; @@ -644,6 +654,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + unsigned long written = 0; ssize_t ret; dprintk("NFS splice_write(%s/%s, %lu@%llu)\n", @@ -654,14 +665,17 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, * The combination of splice and an O_APPEND destination is disallowed. 
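In the nfs_file_write()/nfs_file_splice_write() hunks here, NFSIOS_NORMALWRITTENBYTES accounting moves from the requested byte count to the bytes actually written, and it is only added after the O_DSYNC-style fsync has had its chance to turn the result into an error. A small model of that capture-then-account ordering (write_fn/fsync_fn are stand-in callbacks, not kernel interfaces):

#include <stdio.h>

static long stat_written;   /* stands in for NFSIOS_NORMALWRITTENBYTES */

static long do_write(long (*write_fn)(long), int (*fsync_fn)(void), long count)
{
    long written = 0;
    long result = write_fn(count);

    if (result > 0)
        written = result;          /* capture before fsync can clobber it */
    if (result >= 0 && fsync_fn() < 0)
        result = -1;               /* surface the sync error to the caller */
    if (result > 0)
        stat_written += written;   /* account only fully successful writes */
    return result;
}

static long fake_write(long count) { return count / 2; } /* short write */
static int fake_fsync(void) { return 0; }

int main(void)
{
    long result = do_write(fake_write, fake_fsync, 100);

    printf("result=%ld accounted=%ld\n", result, stat_written);
    return 0;
}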
*/ - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); - ret = generic_file_splice_write(pipe, filp, ppos, count, flags); + if (ret > 0) + written = ret; + if (ret >= 0 && nfs_need_sync_write(filp, inode)) { int err = nfs_do_fsync(nfs_file_open_context(filp), inode); if (err < 0) ret = err; } + if (ret > 0) + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); return ret; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7570573bdb3..e358df75a6a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) -{ - int ret; - - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); - if (ret >= 0) - return 0; - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; -} - void nfs_clear_inode(struct inode *inode) { /* @@ -130,16 +114,12 @@ void nfs_clear_inode(struct inode *inode) */ int nfs_sync_mapping(struct address_space *mapping) { - int ret; + int ret = 0; - if (mapping->nrpages == 0) - return 0; - unmap_mapping_range(mapping, 0, 0, 0); - ret = filemap_write_and_wait(mapping); - if (ret != 0) - goto out; - ret = nfs_wb_all(mapping->host); -out: + if (mapping->nrpages != 0) { + unmap_mapping_range(mapping, 0, 0, 0); + ret = nfs_wb_all(mapping->host); + } return ret; } @@ -511,17 +491,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err; - /* - * Flush out writes to the server in order to update c/mtime. - * - * Hold the i_mutex to suspend application writes temporarily; - * this prevents long-running writing applications from blocking - * nfs_wb_nocommit. - */ + /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); - nfs_wb_nocommit(inode); - mutex_unlock(&inode->i_mutex); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto out; } /* @@ -545,6 +519,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); } +out: return err; } @@ -620,11 +595,6 @@ void put_nfs_open_context(struct nfs_open_context *ctx) __put_nfs_open_context(ctx, 0); } -static void put_nfs_open_context_sync(struct nfs_open_context *ctx) -{ - __put_nfs_open_context(ctx, 1); -} - /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages @@ -671,7 +641,7 @@ static void nfs_file_clear_open_context(struct file *filp) spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); spin_unlock(&inode->i_lock); - put_nfs_open_context_sync(ctx); + __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 
0 : 1); } } @@ -759,7 +729,7 @@ int nfs_attribute_timeout(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); - if (nfs_have_delegation(inode, FMODE_READ)) + if (nfs_have_delegated_attributes(inode)) return 0; return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } @@ -779,7 +749,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) return __nfs_revalidate_inode(server, inode); } -static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) +static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); @@ -800,49 +770,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa return 0; } -static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) -{ - int ret = 0; - - mutex_lock(&inode->i_mutex); - if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) { - ret = nfs_sync_mapping(mapping); - if (ret == 0) - ret = nfs_invalidate_mapping_nolock(inode, mapping); - } - mutex_unlock(&inode->i_mutex); - return ret; -} - -/** - * nfs_revalidate_mapping_nolock - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int ret = 0; - - if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (ret < 0) - goto out; - } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - ret = nfs_invalidate_mapping_nolock(inode, mapping); -out: - return ret; -} - /** * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * - * This version of the function will take the inode->i_mutex and attempt to - * flush out all dirty data if it needs to invalidate the page cache. 
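With nfs_revalidate_mapping_nolock() folded away, a single lock-free revalidation path survives: refresh the inode if its attributes look stale, then drop the pagecache only if the data was flagged invalid. A rough userspace model of that two-step check (the flag names mirror the kernel's, everything else is a stub):

#include <stdio.h>

#define NFS_INO_REVAL_PAGECACHE 0x1
#define NFS_INO_INVALID_DATA    0x2

struct fake_inode { unsigned int cache_validity; int attr_timed_out; };

static int revalidate_inode(struct fake_inode *i)
{
    i->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;  /* attrs now fresh */
    i->attr_timed_out = 0;
    return 0;
}

static int invalidate_mapping(struct fake_inode *i)
{
    i->cache_validity &= ~NFS_INO_INVALID_DATA;
    printf("pagecache dropped\n");
    return 0;
}

/* Shape of nfs_revalidate_mapping() after the consolidation. */
static int revalidate_mapping(struct fake_inode *i)
{
    int ret = 0;

    if ((i->cache_validity & NFS_INO_REVAL_PAGECACHE) || i->attr_timed_out)
        ret = revalidate_inode(i);
    if (ret == 0 && (i->cache_validity & NFS_INO_INVALID_DATA))
        ret = invalidate_mapping(i);
    return ret;
}

int main(void)
{
    struct fake_inode ino = { NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_DATA, 1 };
    return revalidate_mapping(&ino);
}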
*/ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { @@ -1420,6 +1351,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); nfsi->npages = 0; + nfsi->ncommit = 0; atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); init_waitqueue_head(&nfsi->waitqueue); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d23b3..11f82f03c5d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); +extern int nfs_write_inode(struct inode *, struct writeback_control *); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 3f8881d1a05..24992f0a29f 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -22,14 +22,14 @@ #define NFSDBG_FACILITY NFSDBG_PROC -/* A wrapper to handle the EJUKEBOX error message */ +/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ static int nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) { int res; do { res = rpc_call_sync(clnt, msg, flags); - if (res != -EJUKEBOX) + if (res != -EJUKEBOX && res != -EKEYEXPIRED) break; schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; @@ -42,9 +42,10 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) static int nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { - if (task->tk_status != -EJUKEBOX) + if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) return 0; - nfs_inc_stats(inode, NFSIOS_DELAY); + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(inode, NFSIOS_DELAY); task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 0c6fda33d66..a187200a7aa 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -46,6 +46,7 @@ enum nfs4_client_state { NFS4CLNT_DELEGRETURN, NFS4CLNT_SESSION_RESET, NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, }; /* @@ -280,6 +281,7 @@ extern void nfs4_schedule_state_manager(struct nfs_client *); extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 84d83be25a9..f9254fb0c9d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -281,6 +281,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: + case -EKEYEXPIRED: ret = nfs4_delay(server->client, &exception->timeout); if (ret != 0) break; @@ -418,7 +419,8 @@ static void nfs41_sequence_done(struct nfs_client *clp, clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); /* Check 
sequence flags */ - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); } out: /* The session may be reset by one of the error handlers. */ @@ -1163,7 +1165,7 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -1582,6 +1584,7 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: + case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -3145,10 +3148,19 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special * standalone procedure for queueing an asynchronous RENEW. */ +static void nfs4_renew_release(void *data) +{ + struct nfs_client *clp = data; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); +} + static void nfs4_renew_done(struct rpc_task *task, void *data) { - struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp; - unsigned long timestamp = (unsigned long)data; + struct nfs_client *clp = data; + unsigned long timestamp = task->tk_start; if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ @@ -3164,6 +3176,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *data) static const struct rpc_call_ops nfs4_renew_ops = { .rpc_call_done = nfs4_renew_done, + .rpc_release = nfs4_renew_release, }; int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) @@ -3174,8 +3187,10 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) .rpc_cred = cred, }; + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - &nfs4_renew_ops, (void *)jiffies); + &nfs4_renew_ops, clp); } int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) @@ -3452,6 +3467,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (server) nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); task->tk_status = 0; return -EAGAIN; @@ -3564,6 +3580,7 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) case -NFS4ERR_RESOURCE: /* The IBM lawyers misread another document! 
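Earlier in this hunk, the async RENEW path pins the nfs_client with atomic_inc_not_zero() before queueing the RPC and drops the reference in the new .rpc_release callback, rescheduling renewd only while other users still hold the client (cl_count > 1). A plain-counter model of the take-ref/queue/release-ref shape (not thread-safe, purely illustrative):

#include <stdio.h>

struct client { int refcount; };

static int get_client(struct client *c)
{
    if (c->refcount == 0)        /* models atomic_inc_not_zero() */
        return 0;
    c->refcount++;
    return 1;
}

static void put_client(struct client *c) { c->refcount--; }

/* .rpc_release equivalent: runs after the async call completes. */
static void renew_release(struct client *c)
{
    if (c->refcount > 1)
        printf("reschedule state renewal\n");
    put_client(c);
}

static int async_renew(struct client *c)
{
    if (!get_client(c))
        return -5;               /* -EIO: client already being torn down */
    /* ... the RENEW RPC would run asynchronously here ... */
    renew_release(c);
    return 0;
}

int main(void)
{
    struct client c = { .refcount = 2 };
    return async_renew(&c);
}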
*/ case -NFS4ERR_DELAY: + case -EKEYEXPIRED: err = nfs4_delay(clp->cl_rpcclient, &timeout); } } while (err == 0); @@ -4179,7 +4196,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -EKEYEXPIRED) break; nfs4_handle_exception(server, err, &exception); } while (exception.retry); @@ -4204,6 +4221,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request goto out; case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: + case -EKEYEXPIRED: nfs4_handle_exception(server, err, &exception); err = 0; } @@ -4355,6 +4373,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) err = 0; goto out; case -NFS4ERR_DELAY: + case -EKEYEXPIRED: break; } err = nfs4_handle_exception(server, err, &exception); @@ -4500,7 +4519,7 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); - if (status != NFS4ERR_CLID_INUSE) + if (status != -NFS4ERR_CLID_INUSE) break; if (signalled()) @@ -4554,6 +4573,7 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: + case -EKEYEXPIRED: dprintk("%s Retry: tk_status %d\n", __func__, task->tk_status); rpc_delay(task, NFS4_POLL_RETRY_MIN); task->tk_status = 0; @@ -4611,26 +4631,32 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) /* * Reset a slot table */ -static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, int max_slots, - int old_max_slots, int ivalue) +static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, + int ivalue) { + struct nfs4_slot *new = NULL; int i; int ret = 0; - dprintk("--> %s: max_reqs=%u, tbl %p\n", __func__, max_slots, tbl); + dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, + max_reqs, tbl->max_slots); - /* - * Until we have dynamic slot table adjustment, insist - * upon the same slot table size - */ - if (max_slots != old_max_slots) { - dprintk("%s reset slot table does't match old\n", - __func__); - ret = -EINVAL; /*XXX NFS4ERR_REQ_TOO_BIG ? */ - goto out; + /* Does the newly negotiated max_reqs match the existing slot table? 
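nfs4_reset_slot_table() no longer rejects a renegotiated max_reqs that differs from the old table: it allocates the new slot array outside the lock and swaps it in under slot_tbl_lock before reinitialising every seq_nr. A sketch of allocate-outside/swap-under-lock with a pthread mutex standing in for the spinlock; the kernel version can free the old array before locking because a reset runs on a drained table, while this sketch keeps the free inside the critical section for simplicity:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct slot { unsigned int seq_nr; };

struct slot_table {
    pthread_mutex_t lock;
    struct slot *slots;
    unsigned int max_slots;
};

static int reset_slot_table(struct slot_table *tbl, unsigned int max_reqs,
                            unsigned int ivalue)
{
    struct slot *new = NULL;
    unsigned int i;

    if (max_reqs != tbl->max_slots) {
        new = malloc(max_reqs * sizeof(*new));   /* allocate unlocked */
        if (!new)
            return -1;
    }
    pthread_mutex_lock(&tbl->lock);
    if (new) {
        free(tbl->slots);        /* table is idle while a reset runs */
        tbl->slots = new;
        tbl->max_slots = max_reqs;
    }
    for (i = 0; i < tbl->max_slots; i++)
        tbl->slots[i].seq_nr = ivalue;
    pthread_mutex_unlock(&tbl->lock);
    return 0;
}

int main(void)
{
    struct slot_table tbl = { PTHREAD_MUTEX_INITIALIZER, NULL, 0 };
    int ret = reset_slot_table(&tbl, 16, 1);

    printf("resized to %u slots (%d)\n", tbl.max_slots, ret);
    free(tbl.slots);
    return 0;
}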
*/ + if (max_reqs != tbl->max_slots) { + ret = -ENOMEM; + new = kmalloc(max_reqs * sizeof(struct nfs4_slot), + GFP_KERNEL); + if (!new) + goto out; + ret = 0; + kfree(tbl->slots); } spin_lock(&tbl->slot_tbl_lock); - for (i = 0; i < max_slots; ++i) + if (new) { + tbl->slots = new; + tbl->max_slots = max_reqs; + } + for (i = 0; i < tbl->max_slots; ++i) tbl->slots[i].seq_nr = ivalue; spin_unlock(&tbl->slot_tbl_lock); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, @@ -4648,16 +4674,12 @@ static int nfs4_reset_slot_tables(struct nfs4_session *session) int status; status = nfs4_reset_slot_table(&session->fc_slot_table, - session->fc_attrs.max_reqs, - session->fc_slot_table.max_slots, - 1); + session->fc_attrs.max_reqs, 1); if (status) return status; status = nfs4_reset_slot_table(&session->bc_slot_table, - session->bc_attrs.max_reqs, - session->bc_slot_table.max_slots, - 0); + session->bc_attrs.max_reqs, 0); return status; } @@ -4798,16 +4820,14 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.headerpadsz = 0; args->fc_attrs.max_rqst_sz = mxrqst_sz; args->fc_attrs.max_resp_sz = mxresp_sz; - args->fc_attrs.max_resp_sz_cached = mxresp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; args->fc_attrs.max_reqs = session->clp->cl_rpcclient->cl_xprt->max_reqs; dprintk("%s: Fore Channel : max_rqst_sz=%u max_resp_sz=%u " - "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", + "max_ops=%u max_reqs=%u\n", __func__, args->fc_attrs.max_rqst_sz, args->fc_attrs.max_resp_sz, - args->fc_attrs.max_resp_sz_cached, args->fc_attrs.max_ops, - args->fc_attrs.max_reqs); + args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ args->bc_attrs.headerpadsz = 0; @@ -5016,7 +5036,16 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) &res, args.sa_cache_this, 1); } -void nfs41_sequence_call_done(struct rpc_task *task, void *data) +static void nfs41_sequence_release(void *data) +{ + struct nfs_client *clp = (struct nfs_client *)data; + + if (atomic_read(&clp->cl_count) > 1) + nfs4_schedule_state_renewal(clp); + nfs_put_client(clp); +} + +static void nfs41_sequence_call_done(struct rpc_task *task, void *data) { struct nfs_client *clp = (struct nfs_client *)data; @@ -5024,6 +5053,8 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data) if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); + if (atomic_read(&clp->cl_count) == 1) + goto out; if (_nfs4_async_handle_error(task, NULL, clp, NULL) == -EAGAIN) { @@ -5032,7 +5063,7 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data) } } dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); - +out: kfree(task->tk_msg.rpc_argp); kfree(task->tk_msg.rpc_resp); @@ -5057,6 +5088,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data) static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_call_done = nfs41_sequence_call_done, .rpc_call_prepare = nfs41_sequence_prepare, + .rpc_release = nfs41_sequence_release, }; static int nfs41_proc_async_sequence(struct nfs_client *clp, @@ -5069,12 +5101,14 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, .rpc_cred = cred, }; + if (!atomic_inc_not_zero(&clp->cl_count)) + return -EIO; args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; res = kzalloc(sizeof(*res), GFP_KERNEL); - if (!res) { + if (!args || !res) { kfree(args); + kfree(res); + nfs_put_client(clp); return -ENOMEM; } res->sr_slotid = NFS4_MAX_SLOT_TABLE; diff --git 
a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 0156c01c212..d87f10327b7 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -36,11 +36,6 @@ * as an rpc_task, not a real kernel thread, so it always runs in rpciod's * context. There is one renewd per nfs_server. * - * TODO: If the send queue gets backlogged (e.g., if the server goes down), - * we will keep filling the queue with periodic RENEW requests. We need a - * mechanism for ensuring that if renewd successfully sends off a request, - * then it only wakes up when the request is finished. Maybe use the - * child task framework of the RPC layer? */ #include <linux/mm.h> @@ -63,7 +58,7 @@ nfs4_renew_state(struct work_struct *work) struct nfs_client *clp = container_of(work, struct nfs_client, cl_renewd.work); struct rpc_cred *cred; - long lease, timeout; + long lease; unsigned long last, now; ops = nfs4_state_renewal_ops[clp->cl_minorversion]; @@ -75,7 +70,6 @@ nfs4_renew_state(struct work_struct *work) lease = clp->cl_lease_time; last = clp->cl_last_renewal; now = jiffies; - timeout = (2 * lease) / 3 + (long)last - (long)now; /* Are we close to a lease timeout? */ if (time_after(now, last + lease/3)) { cred = ops->get_state_renewal_cred_locked(clp); @@ -90,19 +84,15 @@ nfs4_renew_state(struct work_struct *work) /* Queue an asynchronous RENEW. */ ops->sched_state_renewal(clp, cred); put_rpccred(cred); + goto out_exp; } - timeout = (2 * lease) / 3; - spin_lock(&clp->cl_lock); - } else + } else { dprintk("%s: failed to call renewd. Reason: lease not expired \n", __func__); - if (timeout < 5 * HZ) /* safeguard */ - timeout = 5 * HZ; - dprintk("%s: requeueing work. Lease period = %ld\n", - __func__, (timeout + HZ - 1) / HZ); - cancel_delayed_work(&clp->cl_renewd); - schedule_delayed_work(&clp->cl_renewd, timeout); - spin_unlock(&clp->cl_lock); + spin_unlock(&clp->cl_lock); + } + nfs4_schedule_state_renewal(clp); +out_exp: nfs_expire_unreferenced_delegations(clp); out: dprintk("%s: done\n", __func__); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c1e2733f4fa..6c5ed51f105 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1249,26 +1249,65 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 +void nfs41_handle_recall_slot(struct nfs_client *clp) +{ + set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + nfs4_schedule_state_recovery(clp); +} + +static void nfs4_reset_all_state(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + clp->cl_boot_time = CURRENT_TIME; + nfs4_state_start_reclaim_nograce(clp); + nfs4_schedule_state_recovery(clp); + } +} + +static void nfs41_handle_server_reboot(struct nfs_client *clp) +{ + if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { + nfs4_state_start_reclaim_reboot(clp); + nfs4_schedule_state_recovery(clp); + } +} + +static void nfs41_handle_state_revoked(struct nfs_client *clp) +{ + /* Temporary */ + nfs4_reset_all_state(clp); +} + +static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) +{ + /* This will need to handle layouts too */ + nfs_expire_all_delegations(clp); +} + +static void nfs41_handle_cb_path_down(struct nfs_client *clp) +{ + nfs_expire_all_delegations(clp); + if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) + nfs4_schedule_state_recovery(clp); +} + void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) return; - else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) { - set_bit(NFS4CLNT_LEASE_EXPIRED, 
&clp->cl_state); - nfs4_state_start_reclaim_reboot(clp); - nfs4_schedule_state_recovery(clp); - } else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + nfs41_handle_server_reboot(clp); + else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED | - SEQ4_STATUS_RECALLABLE_STATE_REVOKED | - SEQ4_STATUS_LEASE_MOVED)) { - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_nograce(clp); - nfs4_schedule_state_recovery(clp); - } else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + SEQ4_STATUS_LEASE_MOVED)) + nfs41_handle_state_revoked(clp); + else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + nfs41_handle_recallable_state_revoked(clp); + else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | SEQ4_STATUS_BACKCHANNEL_FAULT | SEQ4_STATUS_CB_PATH_DOWN_SESSION)) - nfs_expire_all_delegations(clp); + nfs41_handle_cb_path_down(clp); } static int nfs4_reset_session(struct nfs_client *clp) @@ -1285,23 +1324,52 @@ static int nfs4_reset_session(struct nfs_client *clp) memset(clp->cl_session->sess_id.data, 0, NFS4_MAX_SESSIONID_LEN); status = nfs4_proc_create_session(clp); - if (status) + if (status) { status = nfs4_recovery_handle_error(clp, status); + goto out; + } + /* create_session negotiated new slot table */ + clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); -out: - /* - * Let the state manager reestablish state - */ - if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && - status == 0) + /* Let the state manager reestablish state */ + if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) nfs41_setup_state_renewal(clp); - +out: return status; } +static int nfs4_recall_slot(struct nfs_client *clp) +{ + struct nfs4_slot_table *fc_tbl = &clp->cl_session->fc_slot_table; + struct nfs4_channel_attrs *fc_attrs = &clp->cl_session->fc_attrs; + struct nfs4_slot *new, *old; + int i; + + nfs4_begin_drain_session(clp); + new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot), + GFP_KERNEL); + if (!new) + return -ENOMEM; + + spin_lock(&fc_tbl->slot_tbl_lock); + for (i = 0; i < fc_tbl->target_max_slots; i++) + new[i].seq_nr = fc_tbl->slots[i].seq_nr; + old = fc_tbl->slots; + fc_tbl->slots = new; + fc_tbl->max_slots = fc_tbl->target_max_slots; + fc_tbl->target_max_slots = 0; + fc_attrs->max_reqs = fc_tbl->max_slots; + spin_unlock(&fc_tbl->slot_tbl_lock); + + kfree(old); + nfs4_end_drain_session(clp); + return 0; +} + #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } +static int nfs4_recall_slot(struct nfs_client *clp) { return 0; } #endif /* CONFIG_NFS_V4_1 */ /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors @@ -1314,6 +1382,7 @@ static void nfs4_set_lease_expired(struct nfs_client *clp, int status) case -NFS4ERR_DELAY: case -NFS4ERR_CLID_INUSE: case -EAGAIN: + case -EKEYEXPIRED: break; case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery @@ -1397,6 +1466,15 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs_client_return_marked_delegations(clp); continue; } + /* Recall session slots */ + if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state) + && nfs4_has_session(clp)) { + status = nfs4_recall_slot(clp); + if (status < 0) + goto out_error; + continue; + } + nfs4_clear_state_manager_bit(clp); /* Did we race with an attempt to give us more work? 
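nfs41_handle_sequence_flag_errors() above is now a priority dispatcher: each SEQ4_STATUS group gets a small helper (server reboot, state revoked, recallable state revoked, callback path down), tested with else-if so only the most severe recovery runs per SEQUENCE reply. A compact model of that dispatch (flag values invented for the demo):

#include <stdio.h>

#define SEQ_RESTART_RECLAIM    0x1
#define SEQ_STATE_REVOKED      0x2
#define SEQ_RECALLABLE_REVOKED 0x4
#define SEQ_CB_PATH_DOWN       0x8

static void handle_reboot(void)     { printf("reclaim after reboot\n"); }
static void handle_revoked(void)    { printf("reclaim nograce\n"); }
static void handle_recallable(void) { printf("expire delegations\n"); }
static void handle_cb_down(void)    { printf("reset session\n"); }

static void handle_sequence_flags(unsigned int flags)
{
    if (!flags)
        return;
    else if (flags & SEQ_RESTART_RECLAIM)
        handle_reboot();
    else if (flags & SEQ_STATE_REVOKED)
        handle_revoked();
    else if (flags & SEQ_RECALLABLE_REVOKED)
        handle_recallable();
    else if (flags & SEQ_CB_PATH_DOWN)
        handle_cb_down();
}

int main(void)
{
    /* Reboot wins even when lesser flags are also set. */
    handle_sequence_flags(SEQ_RESTART_RECLAIM | SEQ_CB_PATH_DOWN);
    return 0;
}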
*/ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5cd5184b56d..4d338be492c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1578,6 +1578,14 @@ static void encode_create_session(struct xdr_stream *xdr, char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; uint32_t len; struct nfs_client *clp = args->client; + u32 max_resp_sz_cached; + + /* + * Assumes OPEN is the biggest non-idempotent compound. + * 2 is the verifier. + */ + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + + RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; len = scnprintf(machine_name, sizeof(machine_name), "%s", clp->cl_ipaddr); @@ -1592,7 +1600,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ - *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a12c45b65dd..29d9d36cd5f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req) */ int nfs_set_page_tag_locked(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode); - if (!nfs_lock_request_dontget(req)) return 0; if (req->wb_page != NULL) - radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); return 1; } @@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req) */ void nfs_clear_page_tag_locked(struct nfs_page *req) { - struct inode *inode = req->wb_context->path.dentry->d_inode; - struct nfs_inode *nfsi = NFS_I(inode); - if (req->wb_page != NULL) { + struct inode *inode = req->wb_context->path.dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + spin_lock(&inode->i_lock); radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); nfs_unlock_request(req); @@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req) * nfs_clear_request - Free up all resources allocated to the request * @req: * - * Release page resources associated with a write request after it - * has completed. + * Release page and open context resources associated with a read/write + * request after it has completed. */ void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; + if (page != NULL) { page_cache_release(page); req->wb_page = NULL; } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } } @@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref) { struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); - /* Release struct file or cached credential */ + /* Release struct file and open context */ nfs_clear_request(req); - put_nfs_open_context(req->wb_context); nfs_page_free(req); } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ef583854d8d..c752d944fe9 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -47,6 +47,39 @@ #define NFSDBG_FACILITY NFSDBG_PROC /* + * wrapper to handle the -EKEYEXPIRED error message. 
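In fs/nfs/proc.c just below, every existing rpc_call_sync() call site is routed through the retry wrapper by a function-like #define rather than by editing each caller; this works because the macro is defined after the wrapper body, so the wrapper's own call to the real function is not rewritten. A tiny demonstration of the idiom (do_call/do_call_retrying are invented names):

#include <stdio.h>

static int do_call(int x) { return x < 0 ? -1 : x; }

/* Retry wrapper; its call to do_call() appears before the macro below,
 * so the preprocessor leaves it pointing at the real function. */
static int do_call_retrying(int x)
{
    int res, tries = 3;

    do {
        res = do_call(x);
        if (res != -1)
            break;
        x = -x;                  /* pretend the transient fault clears */
    } while (--tries);
    return res;
}

/* From here on, every do_call() in this file goes through the wrapper. */
#define do_call(x) do_call_retrying(x)

int main(void)
{
    printf("%d\n", do_call(-7));   /* expands to do_call_retrying(-7) */
    return 0;
}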
This should generally + * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't + * support the NFSERR_JUKEBOX error code, but we handle this situation in the + * same way that we handle that error with NFSv3. + */ +static int +nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + int res; + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EKEYEXPIRED) + break; + schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!fatal_signal_pending(current)); + return res; +} + +#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags) + +static int +nfs_async_handle_expired_key(struct rpc_task *task) +{ + if (task->tk_status != -EKEYEXPIRED) + return 0; + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +/* * Bare-bones access to getattr: this is for nfs_read_super. */ static int @@ -307,6 +340,8 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct inode *dir) static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) { + if (nfs_async_handle_expired_key(task)) + return 0; nfs_mark_for_revalidate(dir); return 1; } @@ -560,6 +595,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) { + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + nfs_invalidate_atime(data->inode); if (task->tk_status >= 0) { nfs_refresh_inode(data->inode, data->res.fattr); @@ -579,6 +617,9 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message * static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) { + if (nfs_async_handle_expired_key(task)) + return -EAGAIN; + if (task->tk_status >= 0) nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); return 0; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f1afee4eea7..6baf9a39346 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2214,7 +2214,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, } else { error = nfs_bdi_register(server); if (error) - goto error_splat_super; + goto error_splat_bdi; } if (!s->s_root) { @@ -2256,6 +2256,9 @@ out_err_nosb: error_splat_root: dput(mntroot); error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: deactivate_locked_super(s); goto out; } @@ -2326,7 +2329,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, } else { error = nfs_bdi_register(server); if (error) - goto error_splat_super; + goto error_splat_bdi; } if (!s->s_root) { @@ -2363,6 +2366,9 @@ out_err_noserver: return error; error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: deactivate_locked_super(s); dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); return error; @@ -2578,7 +2584,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type, } else { error = nfs_bdi_register(server); if (error) - goto error_splat_super; + goto error_splat_bdi; } if (!s->s_root) { @@ -2616,6 +2622,9 @@ out_free: error_splat_root: dput(mntroot); error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: deactivate_locked_super(s); goto out; } @@ -2811,7 +2820,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, } else { error = nfs_bdi_register(server); if (error) - goto error_splat_super; + goto error_splat_bdi; } if (!s->s_root) { 
@@ -2847,6 +2856,9 @@ out_err_noserver: return error; error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: deactivate_locked_super(s); dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); return error; @@ -2893,7 +2905,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, } else { error = nfs_bdi_register(server); if (error) - goto error_splat_super; + goto error_splat_bdi; } if (!s->s_root) { @@ -2929,6 +2941,9 @@ out_err_noserver: return error; error_splat_super: + if (server && !s->s_root) + bdi_unregister(&server->backing_dev_info); +error_splat_bdi: deactivate_locked_super(s); dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); return error; diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 412738dbfbc..2ea9e5c27e5 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -50,7 +50,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) struct page *page; void *err; - err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping)); + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) goto read_failed; page = read_cache_page(&inode->i_data, 0, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d63d964a039..53ff70e2399 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req) radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_COMMIT); + nfsi->ncommit++; spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); @@ -501,57 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) } #endif -/* - * Wait for a request to complete. - * - * Interruptible by fatal signals only. 
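The write.c changes nearby add an ncommit counter so writeback can ask in O(1) how many unstable pages await COMMIT: nfs_mark_request_commit() increments it, and nfs_scan_commit() subtracts however many requests it harvests, re-marking the inode I_DIRTY_DATASYNC if any remain. A bookkeeping model of that counter discipline:

#include <stdio.h>

struct fake_nfs_inode { long npages; long ncommit; };

static void mark_request_commit(struct fake_nfs_inode *nfsi)
{
    nfsi->ncommit++;             /* one more page awaiting COMMIT */
}

/* Returns how many commit-tagged requests were moved to the commit list. */
static long scan_commit(struct fake_nfs_inode *nfsi, long want)
{
    long got = want < nfsi->ncommit ? want : nfsi->ncommit;

    nfsi->ncommit -= got;
    if (nfsi->ncommit > 0)
        printf("re-mark inode I_DIRTY_DATASYNC\n");
    return got;
}

int main(void)
{
    struct fake_nfs_inode nfsi = { .npages = 8, .ncommit = 0 };
    long got;
    int i;

    for (i = 0; i < 5; i++)
        mark_request_commit(&nfsi);
    got = scan_commit(&nfsi, 3);
    printf("scanned %ld, %ld left\n", got, nfsi.ncommit);
    return 0;
}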
- */ -static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *req; - pgoff_t idx_end, next; - unsigned int res = 0; - int error; - - if (npages == 0) - idx_end = ~0; - else - idx_end = idx_start + npages - 1; - - next = idx_start; - while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) { - if (req->wb_index > idx_end) - break; - - next = req->wb_index + 1; - BUG_ON(!NFS_WBACK_BUSY(req)); - - kref_get(&req->wb_kref); - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - spin_lock(&inode->i_lock); - if (error < 0) - return error; - res++; - } - return res; -} - -static void nfs_cancel_commit_list(struct list_head *head) -{ - struct nfs_page *req; - - while(!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_clear_request_commit(req); - nfs_inode_remove_request(req); - nfs_unlock_request(req); - } -} - #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static int nfs_need_commit(struct nfs_inode *nfsi) @@ -573,11 +523,17 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); + int ret; if (!nfs_need_commit(nfsi)) return 0; - return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + return ret; } #else static inline int nfs_need_commit(struct nfs_inode *nfsi) @@ -642,9 +598,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); } - if (nfs_clear_request_commit(req)) - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_COMMIT); + if (nfs_clear_request_commit(req) && + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) + NFS_I(inode)->ncommit--; /* Okay, the request matches. 
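The nfs_commit_unstable_pages() helper introduced just below is what ->write_inode now boils down to: a non-blocking flush defers COMMIT while no more than half of the inode's pages are unstable (merely re-marking the inode dirty), otherwise it commits and credits the committed pages against wbc->nr_to_write. A model of that threshold decision (field names simplified):

#include <stdio.h>

struct wbc { int sync_all; long nr_to_write; };

static long commit_inode(long ncommit) { return ncommit; } /* stub COMMIT */

static int commit_unstable_pages(long npages, long ncommit, struct wbc *wbc)
{
    long done;

    /* Non-blocking flush with few unstable pages: defer the COMMIT. */
    if (!wbc->sync_all && ncommit <= (npages >> 1)) {
        printf("defer: re-mark inode dirty\n");
        return 0;
    }
    done = commit_inode(ncommit);
    if (!wbc->sync_all)
        wbc->nr_to_write -= done < wbc->nr_to_write ? done : wbc->nr_to_write;
    return 0;
}

int main(void)
{
    struct wbc wbc = { .sync_all = 0, .nr_to_write = 16 };

    commit_unstable_pages(100, 30, &wbc);   /* 30 <= 50: deferred */
    commit_unstable_pages(100, 80, &wbc);   /* committed, credits budget */
    printf("nr_to_write now %ld\n", wbc.nr_to_write);
    return 0;
}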
Update the region */ if (offset < req->wb_offset) { @@ -1391,7 +1348,7 @@ static const struct rpc_call_ops nfs_commit_ops = { .rpc_release = nfs_commit_release, }; -int nfs_commit_inode(struct inode *inode, int how) +static int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int res; @@ -1406,92 +1363,51 @@ int nfs_commit_inode(struct inode *inode, int how) } return res; } -#else -static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) -{ - return 0; -} -#endif -long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { - struct inode *inode = mapping->host; - pgoff_t idx_start, idx_end; - unsigned int npages = 0; - LIST_HEAD(head); - int nocommit = how & FLUSH_NOCOMMIT; - long pages, ret; - - /* FIXME */ - if (wbc->range_cyclic) - idx_start = 0; - else { - idx_start = wbc->range_start >> PAGE_CACHE_SHIFT; - idx_end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (idx_end > idx_start) { - pgoff_t l_npages = 1 + idx_end - idx_start; - npages = l_npages; - if (sizeof(npages) != sizeof(l_npages) && - (pgoff_t)npages != l_npages) - npages = 0; + struct nfs_inode *nfsi = NFS_I(inode); + int flags = FLUSH_SYNC; + int ret = 0; + + /* Don't commit yet if this is a non-blocking flush and there are + * lots of outstanding writes for this mapping. + */ + if (wbc->sync_mode == WB_SYNC_NONE && + nfsi->ncommit <= (nfsi->npages >> 1)) + goto out_mark_dirty; + + if (wbc->nonblocking || wbc->for_background) + flags = 0; + ret = nfs_commit_inode(inode, flags); + if (ret >= 0) { + if (wbc->sync_mode == WB_SYNC_NONE) { + if (ret < wbc->nr_to_write) + wbc->nr_to_write -= ret; + else + wbc->nr_to_write = 0; } + return 0; } - how &= ~FLUSH_NOCOMMIT; - spin_lock(&inode->i_lock); - do { - ret = nfs_wait_on_requests_locked(inode, idx_start, npages); - if (ret != 0) - continue; - if (nocommit) - break; - pages = nfs_scan_commit(inode, &head, idx_start, npages); - if (pages == 0) - break; - if (how & FLUSH_INVALIDATE) { - spin_unlock(&inode->i_lock); - nfs_cancel_commit_list(&head); - ret = pages; - spin_lock(&inode->i_lock); - continue; - } - pages += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&inode->i_lock); - ret = nfs_commit_list(inode, &head, how); - spin_lock(&inode->i_lock); - - } while (ret >= 0); - spin_unlock(&inode->i_lock); +out_mark_dirty: + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } - -static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) +#else +static int nfs_commit_inode(struct inode *inode, int how) { - int ret; - - ret = nfs_writepages(mapping, wbc); - if (ret < 0) - goto out; - ret = nfs_sync_mapping_wait(mapping, wbc, how); - if (ret < 0) - goto out; return 0; -out: - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return ret; } -/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ -static int nfs_write_mapping(struct address_space *mapping, int how) +static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) { - struct writeback_control wbc = { - .bdi = mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - }; + return 0; +} +#endif - return __nfs_write_mapping(mapping, &wbc, how); +int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return nfs_commit_unstable_pages(inode, wbc); } /* @@ -1499,37 +1415,26 @@ 
static int nfs_write_mapping(struct address_space *mapping, int how) */ int nfs_wb_all(struct inode *inode) { - return nfs_write_mapping(inode->i_mapping, 0); -} + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; -int nfs_wb_nocommit(struct inode *inode) -{ - return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT); + return sync_inode(inode, &wbc); } int nfs_wb_page_cancel(struct inode *inode, struct page *page) { struct nfs_page *req; - loff_t range_start = page_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); - struct writeback_control wbc = { - .bdi = page->mapping->backing_dev_info, - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = range_start, - .range_end = range_end, - }; int ret = 0; BUG_ON(!PageLocked(page)); for (;;) { req = nfs_page_find_request(page); if (req == NULL) - goto out; - if (test_bit(PG_CLEAN, &req->wb_flags)) { - nfs_release_request(req); break; - } if (nfs_lock_request_dontget(req)) { nfs_inode_remove_request(req); /* @@ -1543,54 +1448,54 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret < 0) - goto out; + break; } - if (!PagePrivate(page)) - return 0; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE); -out: return ret; } -static int nfs_wb_page_priority(struct inode *inode, struct page *page, - int how) +/* + * Write back all requests on one page - we do this before reading it. + */ +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); struct writeback_control wbc = { - .bdi = page->mapping->backing_dev_info, .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, + .nr_to_write = 0, .range_start = range_start, .range_end = range_end, }; + struct nfs_page *req; + int need_commit; int ret; - do { + while(PagePrivate(page)) { if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; - } else if (!PagePrivate(page)) + } + req = nfs_find_and_lock_request(page); + if (!req) break; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); - if (ret < 0) + if (IS_ERR(req)) { + ret = PTR_ERR(req); goto out_error; - } while (PagePrivate(page)); + } + need_commit = test_bit(PG_CLEAN, &req->wb_flags); + nfs_clear_page_tag_locked(req); + if (need_commit) { + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out_error; + } + } return 0; out_error: - __mark_inode_dirty(inode, I_DIRTY_PAGES); return ret; } -/* - * Write back all requests on one page - we do this before reading it. 
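nfs_wb_all() above shrinks to a single sync_inode() call with a WB_SYNC_ALL writeback_control, which writes the dirty pages and then, via the new ->write_inode path, issues the COMMIT; the old two-pass nfs_write_mapping() disappears. A hedged shape of the fill-a-wbc-and-delegate pattern (sync_inode_stub stands in for the VFS call):

#include <limits.h>
#include <stdio.h>

struct writeback_control { int sync_mode; long nr_to_write; };
#define WB_SYNC_ALL 1

/* Stand-in for sync_inode(): write pages, then ->write_inode (COMMIT). */
static int sync_inode_stub(struct writeback_control *wbc)
{
    printf("writepages(sync=%d), then write_inode -> COMMIT (budget %ld)\n",
           wbc->sync_mode, wbc->nr_to_write);
    return 0;
}

static int wb_all(void)
{
    struct writeback_control wbc = {
        .sync_mode = WB_SYNC_ALL,   /* wait on every page */
        .nr_to_write = LONG_MAX,    /* no page budget: flush everything */
    };
    return sync_inode_stub(&wbc);
}

int main(void) { return wb_all(); }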
- */ -int nfs_wb_page(struct inode *inode, struct page* page) -{ - return nfs_wb_page_priority(inode, page, FLUSH_STABLE); -} - #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c6eed2a3b09..4bc22c763de 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -525,6 +525,8 @@ static struct rpc_cred *callback_cred; int set_callback_cred(void) { + if (callback_cred) + return 0; callback_cred = rpc_lookup_machine_cred(); if (!callback_cred) return -ENOMEM; @@ -542,7 +544,8 @@ void do_probe_callback(struct nfs4_client *clp) }; int status; - status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, + status = rpc_call_async(cb->cb_client, &msg, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN, &nfsd4_cb_probe_ops, (void *)clp); if (status) { warn_no_callback_path(clp, status); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5a754f7b71e..98fb98e330b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -119,9 +119,7 @@ out_no_tfm: static void nfsd4_sync_rec_dir(void) { - mutex_lock(&rec_dir.dentry->d_inode->i_mutex); - nfsd_sync_dir(rec_dir.dentry); - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + vfs_fsync(NULL, rec_dir.dentry, 0); } int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f19ed866c95..c97fddbd17d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1998,7 +1998,9 @@ nfs4_file_downgrade(struct file *filp, unsigned int share_access) { if (share_access & NFS4_SHARE_ACCESS_WRITE) { drop_file_write_access(filp); + spin_lock(&filp->f_lock); filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; + spin_unlock(&filp->f_lock); } } @@ -2480,8 +2482,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); - if (nfsd4_has_session(&resp->cstate)) + if (nfsd4_has_session(&resp->cstate)) { open->op_stateowner->so_confirmed = 1; + nfsd4_create_clid_dir(open->op_stateowner->so_client); + } /* * Attempt to hand out a delegation. No error return, because the diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index bbf72d8f9fc..c47b4d7bafa 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1434,7 +1434,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) } op->opnum = ntohl(*argp->p++); - if (op->opnum >= OP_ACCESS && op->opnum < ops->nops) + if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) op->status = ops->decoders[op->opnum](argp, &op->u); else { op->opnum = OP_ILLEGAL; @@ -1528,7 +1528,7 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) } } while (0); /* Encode as an array of strings the string given with components - * seperated @sep. + * separated @sep. 
*/ static __be32 nfsd4_encode_components(char sep, char *components, __be32 **pp, int *buflen) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2604c3e70ea..0f0e77f2012 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -988,6 +988,7 @@ static ssize_t __write_ports_delfd(char *buf) static ssize_t __write_ports_addxprt(char *buf) { char transport[16]; + struct svc_xprt *xprt; int port, err; if (sscanf(buf, "%15s %4u", transport, &port) != 2) @@ -1002,13 +1003,24 @@ static ssize_t __write_ports_addxprt(char *buf) err = svc_create_xprt(nfsd_serv, transport, PF_INET, port, SVC_SOCK_ANONYMOUS); - if (err < 0) { - /* Give a reasonable perror msg for bad transport string */ - if (err == -ENOENT) - err = -EPROTONOSUPPORT; - return err; - } + if (err < 0) + goto out_err; + + err = svc_create_xprt(nfsd_serv, transport, + PF_INET6, port, SVC_SOCK_ANONYMOUS); + if (err < 0 && err != -EAFNOSUPPORT) + goto out_close; return 0; +out_close: + xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); + if (xprt != NULL) { + svc_close_xprt(xprt); + svc_xprt_put(xprt); + } +out_err: + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; + return err; } /* diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 15dc2deaac5..a11b0e8678e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -20,13 +20,14 @@ #include <linux/fcntl.h> #include <linux/namei.h> #include <linux/delay.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> #include <linux/jhash.h> #include <linux/ima.h> #include <asm/uaccess.h> +#include <linux/exportfs.h> +#include <linux/writeback.h> #ifdef CONFIG_NFSD_V3 #include "xdr3.h" @@ -271,6 +272,32 @@ out: return err; } +/* + * Commit metadata changes to stable storage. + */ +static int +commit_metadata(struct svc_fh *fhp) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + const struct export_operations *export_ops = inode->i_sb->s_export_op; + int error = 0; + + if (!EX_ISSYNC(fhp->fh_export)) + return 0; + + if (export_ops->commit_metadata) { + error = export_ops->commit_metadata(inode); + } else { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* metadata only */ + }; + + error = sync_inode(inode, &wbc); + } + + return error; +} /* * Set various file attributes. @@ -377,7 +404,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, put_write_access(inode); goto out_nfserr; } - vfs_dq_init(inode); } /* sanitize the mode change */ @@ -745,8 +771,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags = O_RDWR|O_LARGEFILE; else flags = O_WRONLY|O_LARGEFILE; - - vfs_dq_init(inode); } *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), flags, current_cred()); @@ -771,43 +795,6 @@ nfsd_close(struct file *filp) } /* - * Sync a file - * As this calls fsync (not fdatasync) there is no need for a write_inode - * after it. 
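The commit_metadata() helper above prefers a filesystem-provided hook over the generic sync_inode() fallback: an exporting filesystem opts in by filling the commit_metadata member of its export_operations, typically forcing its journal up to the transaction that last touched the inode. A hedged sketch of what such wiring might look like on the filesystem side; the myfs_* names, including the myfs_force_log_commit() primitive, are hypothetical:

    #include <linux/exportfs.h>
    #include <linux/fs.h>

    /*
     * Hypothetical hook, called via nfsd's commit_metadata(): push this
     * inode's metadata to stable storage and return 0 or a negative errno.
     */
    static int myfs_commit_metadata(struct inode *inode)
    {
            /* Stand-in for the filesystem's own "commit the log up to
             * the transaction covering this inode" primitive. */
            return myfs_force_log_commit(inode->i_sb, inode);
    }

    static const struct export_operations myfs_export_ops = {
            .commit_metadata = myfs_commit_metadata,
    };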
- */ -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, - const struct file_operations *fop) -{ - struct inode *inode = dp->d_inode; - int (*fsync) (struct file *, struct dentry *, int); - int err; - - err = filemap_write_and_wait(inode->i_mapping); - if (err == 0 && fop && (fsync = fop->fsync)) - err = fsync(filp, dp, 0); - return err; -} - -static int -nfsd_sync(struct file *filp) -{ - int err; - struct inode *inode = filp->f_path.dentry->d_inode; - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); - mutex_lock(&inode->i_mutex); - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); - mutex_unlock(&inode->i_mutex); - - return err; -} - -int -nfsd_sync_dir(struct dentry *dp) -{ - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); -} - -/* * Obtain the readahead parameters for the file * specified by (dev, ino). */ @@ -1010,7 +997,7 @@ static int wait_for_concurrent_writes(struct file *file) if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); - err = nfsd_sync(file); + err = vfs_fsync(file, file->f_path.dentry, 0); } last_ino = inode->i_ino; last_dev = inode->i_sb->s_dev; @@ -1158,8 +1145,9 @@ out: #ifdef CONFIG_NFSD_V3 /* * Commit all pending writes to stable storage. - * Strictly speaking, we could sync just the indicated file region here, - * but there's currently no way we can ask the VFS to do so. + * + * Note: we only guarantee that data that lies within the range specified + * by the 'offset' and 'count' parameters will be synced. * * Unfortunately we cannot lock the file to make sure we return full WCC * data to the client, as locking happens lower down in the filesystem. @@ -1169,23 +1157,32 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count) { struct file *file; - __be32 err; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; - if ((u64)count > ~(u64)offset) - return nfserr_inval; + if (offset < 0) + goto out; + if (count != 0) { + end = offset + (loff_t)count - 1; + if (end < offset) + goto out; + } err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) - return err; + goto out; if (EX_ISSYNC(fhp->fh_export)) { - if (file->f_op && file->f_op->fsync) { - err = nfserrno(nfsd_sync(file)); - } else { + int err2 = vfs_fsync_range(file, file->f_path.dentry, + offset, end, 0); + + if (err2 != -EINVAL) + err = nfserrno(err2); + else err = nfserr_notsupp; - } } nfsd_close(file); +out: return err; } #endif /* CONFIG_NFSD_V3 */ @@ -1338,12 +1335,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } - if (EX_ISSYNC(fhp->fh_export)) { - err = nfserrno(nfsd_sync_dir(dentry)); - write_inode_now(dchild->d_inode, 1); - } + err = nfsd_create_setattr(rqstp, resfhp, iap); - err2 = nfsd_create_setattr(rqstp, resfhp, iap); + /* + * nfsd_setattr already committed the child. Transactional filesystems + * had a chance to commit changes for both parent and child + * simultaneously making the following commit_metadata a noop. 
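The reworked nfsd_commit() above turns the client-supplied (offset, count) pair into an inclusive byte range for vfs_fsync_range(): count == 0 means "through end of file", a negative offset is rejected, and so is any range whose end would wrap past LLONG_MAX. The same computation as a small standalone sketch; commit_range_end() is an illustrative name and long long stands in for loff_t:

    #include <limits.h>

    /*
     * Mirrors the argument checking in the new nfsd_commit(): on success
     * *end holds the inclusive end of the range to sync; returns -1 for a
     * negative offset or an end that wraps.  (The kernel is built with
     * -fno-strict-overflow, which makes the wrap test well defined.)
     */
    static int commit_range_end(long long offset, unsigned long count,
                                long long *end)
    {
            if (offset < 0)
                    return -1;
            if (count == 0) {
                    *end = LLONG_MAX;       /* sync through end of file */
                    return 0;
            }
            *end = offset + (long long)count - 1;
            if (*end < offset)              /* offset + count overflowed */
                    return -1;
            return 0;
    }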
+ */ + err2 = nfserrno(commit_metadata(fhp)); if (err2) err = err2; mnt_drop_write(fhp->fh_export->ex_path.mnt); @@ -1375,7 +1374,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dchild = NULL; struct inode *dirp; __be32 err; - __be32 err2; int host_err; __u32 v_mtime=0, v_atime=0; @@ -1470,11 +1468,6 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (created) *created = 1; - if (EX_ISSYNC(fhp->fh_export)) { - err = nfserrno(nfsd_sync_dir(dentry)); - /* setattr will sync the child (or not) */ - } - nfsd_check_ignore_resizing(iap); if (createmode == NFS3_CREATE_EXCLUSIVE) { @@ -1489,9 +1482,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, } set_attr: - err2 = nfsd_create_setattr(rqstp, resfhp, iap); - if (err2) - err = err2; + err = nfsd_create_setattr(rqstp, resfhp, iap); + + /* + * nfsd_setattr already committed the child (and possibly also the parent). + */ + if (!err) + err = nfserrno(commit_metadata(fhp)); mnt_drop_write(fhp->fh_export->ex_path.mnt); /* @@ -1606,12 +1603,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, } } else host_err = vfs_symlink(dentry->d_inode, dnew, path); - - if (!host_err) { - if (EX_ISSYNC(fhp->fh_export)) - host_err = nfsd_sync_dir(dentry); - } err = nfserrno(host_err); + if (!err) + err = nfserrno(commit_metadata(fhp)); fh_unlock(fhp); mnt_drop_write(fhp->fh_export->ex_path.mnt); @@ -1673,11 +1667,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { - if (EX_ISSYNC(ffhp->fh_export)) { - err = nfserrno(nfsd_sync_dir(ddir)); - write_inode_now(dest, 1); - } - err = 0; + err = nfserrno(commit_metadata(ffhp)); + if (!err) + err = nfserrno(commit_metadata(tfhp)); } else { if (host_err == -EXDEV && rqstp->rq_vers == 2) err = nfserr_acces; @@ -1773,10 +1765,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out_dput_new; host_err = vfs_rename(fdir, odentry, tdir, ndentry); - if (!host_err && EX_ISSYNC(tfhp->fh_export)) { - host_err = nfsd_sync_dir(tdentry); + if (!host_err) { + host_err = commit_metadata(tfhp); if (!host_err) - host_err = nfsd_sync_dir(fdentry); + host_err = commit_metadata(ffhp); } mnt_drop_write(ffhp->fh_export->ex_path.mnt); @@ -1857,12 +1849,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dput(rdentry); - if (host_err) - goto out_drop; - if (EX_ISSYNC(fhp->fh_export)) - host_err = nfsd_sync_dir(dentry); + if (!host_err) + host_err = commit_metadata(fhp); -out_drop: mnt_drop_write(fhp->fh_export->ex_path.mnt); out_nfserr: err = nfserrno(host_err); diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index f4543ac4f56..5cccf874d69 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64, const struct buffer_head *, void *); /** - * nilfs_palloc_req - persistent alloctor request and reply + * nilfs_palloc_req - persistent allocator request and reply * @pr_entry_nr: entry number (vblocknr or inode number) * @pr_desc_bh: buffer head of the buffer containing block group descriptors * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 9d1e5de91af..01314675568 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) * @vblocknrs and @nitems. * * Return Value: On success, 0 is returned. 
On error, one of the following - * nagative error codes is returned. + * negative error codes is returned. * * %-EIO - I/O error. * diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0092840492e..85c89dfc71f 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, /* next page is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { nilfs_error(dir->i_sb, __func__, - "dir %lu size %lld exceeds block cout %llu", + "dir %lu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); goto out; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index e16a6664dfa..8880a9e281e 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -28,10 +28,10 @@ * gcinodes), and this file provides lookup function of the dummy * inodes and their buffer read function. * - * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it + * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it * has to treat blocks that belong to a same file but have different * checkpoint numbers. To avoid interference among generations, dummy - * inodes are managed separatly from actual inodes, and their lookup + * inodes are managed separately from actual inodes, and their lookup * function (nilfs_gc_iget) is designed to be specified with a * checkpoint number argument as well as an inode number. * diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index a2692bbc7b5..fc246dba112 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -292,7 +292,7 @@ void nilfs_free_private_page(struct page *page) * @src: source page * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. * - * This fuction is for both data pages and btnode pages. The dirty flag + * This function is for both data pages and btnode pages. The dirty flag * should be treated by caller. The page must not be under i/o. * Both src and dst page must be locked */ @@ -388,7 +388,7 @@ repeat: } /** - * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache + * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache * @dmap: destination page cache * @smap: source page cache * diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index ab56fe44e37..636eaafd6ea 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -32,7 +32,7 @@ struct nilfs_write_info { struct the_nilfs *nilfs; struct bio *bio; - int start, end; /* The region to be submitted */ + int start, end; /* The region to be submitted */ int rest_blocks; int max_pages; int nr_vecs; @@ -174,7 +174,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, } /* - * Setup segument summary + * Setup segment summary */ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) { @@ -470,8 +470,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, * * %-ENOMEM - Insufficient memory available. 
*/ -int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, - struct the_nilfs *nilfs) +static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct the_nilfs *nilfs) { struct nilfs_write_info wi; struct buffer_head *bh; @@ -514,7 +514,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * * %-EIO - I/O error */ -int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) +static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) { int err = 0; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ada2f1b947a..69576a95e13 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -141,7 +141,7 @@ int nilfs_init_transaction_cache(void) } /** - * nilfs_detroy_transaction_cache - destroy the cache for transaction info + * nilfs_destroy_transaction_cache - destroy the cache for transaction info * * nilfs_destroy_transaction_cache() frees the slab cache for the struct * nilfs_transaction_info. @@ -201,7 +201,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) * This function allocates a nilfs_transaction_info struct to keep context * information on it. It is initialized and hooked onto the current task in * the outermost call. If a pre-allocated struct is given to @ti, it is used - * instead; othewise a new struct is assigned from a slab. + * instead; otherwise a new struct is assigned from a slab. * * When @vacancy_check flag is set, this function will check the amount of * free space, and will wait for the GC to reclaim disk space if low capacity. @@ -2214,7 +2214,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) } /** - * nilfs_secgtor_start_timer - set timer of background write + * nilfs_segctor_start_timer - set timer of background write * @sci: nilfs_sc_info * * If the timer has already been set, it ignores the new request. @@ -2854,7 +2854,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) * @sbi: nilfs_sb_info * * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, - * initilizes it, and starts the segment constructor. + * initializes it, and starts the segment constructor. * * Return Value: On success, 0 is returned. On error, one of the following * negative error code is returned. diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 3155e0c7f41..82dfd6a686b 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -30,7 +30,7 @@ #include "sb.h" /** - * struct nilfs_recovery_info - Recovery infomation + * struct nilfs_recovery_info - Recovery information * @ri_need_recovery: Recovery status * @ri_super_root: Block number of the last super root * @ri_ri_cno: Number of the last checkpoint @@ -71,7 +71,7 @@ struct nilfs_recovery_info { */ struct nilfs_cstage { int scnt; - unsigned flags; + unsigned flags; struct nilfs_inode_info *dirty_file_ptr; struct nilfs_inode_info *gc_inode_ptr; }; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index b6c36d0cc33..3c6cc6005c2 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -18,7 +18,7 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * Written by Koji Sato <koji@osrg.net>. - * Rivised by Ryusuke Konishi <ryusuke@osrg.net>. + * Revised by Ryusuke Konishi <ryusuke@osrg.net>. 
*/ #include <linux/kernel.h> diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 92579cc4c93..0cdbc5e7655 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -436,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* * Compute the overhead * - * When distributing meta data blocks outside semgent structure, + * When distributing meta data blocks outside segment structure, * We must count them as the overhead. */ overhead = 0; @@ -866,7 +866,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) && sbi->s_snapshot_cno != old_opts.snapshot_cno) { printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount to a different snapshot. \n", + "remount to a different snapshot.\n", sb->s_id); err = -EINVAL; goto restore_opts; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 92733d5651d..33871f7e4f0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { - printk(KERN_ERR "NILFS: too short segment. \n"); + printk(KERN_ERR "NILFS: too short segment.\n"); return -EINVAL; } diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog deleted file mode 100644 index 37c11e19437..00000000000 --- a/fs/ntfs/ChangeLog +++ /dev/null @@ -1,1702 +0,0 @@ -ToDo/Notes: - - Find and fix bugs. - - The only places in the kernel where a file is resized are - ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is - held. Just have to be careful in read-/writepage and other helpers - not running under i_mutex that we play nice. Also need to be careful - with initialized_size extension in ntfs_file_write*() and writepage. - UPDATE: The only things that need to be checked are the compressed - write and the other attribute resize/write cases like index - attributes, etc. For now none of these are implemented so are safe. - - Implement filling in of holes in aops.c::ntfs_writepage() and its - helpers. - - Implement mft.c::sync_mft_mirror_umount(). We currently will just - leave the volume dirty on umount if the final iput(vol->mft_ino) - causes a write of any mirrored mft records due to the mft mirror - inode having been discarded already. Whether this can actually ever - happen is unclear however so it is worth waiting until someone hits - the problem. - -2.1.29 - Fix a deadlock at mount time. - - - During mount the VFS holds s_umount lock on the superblock. So when - we try to empty the journal $LogFile contents by calling - ntfs_attr_set() when the machine does not have much memory and the - journal is large ntfs_attr_set() results in the VM trying to balance - dirty pages which in turn tries to take the s_umount lock and thus we - get a deadlock. The solution is to not use ntfs_attr_set() and - instead do the zeroing by hand at the block level rather than page - cache level. - - Fix sparse warnings. - -2.1.28 - Fix a deadlock. - - - Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode(). Thanks to Sergey - Vlasov for the report and detailed analysis of the deadlock. The fix - involved getting rid of ntfs_put_inode() altogether and hence NTFS no - longer has a ->put_inode super operation. - -2.1.27 - Various bug fixes and cleanups. - - - Fix two compiler warnings on Alpha. Thanks to Andrew Morton for - reporting them. - - Fix an (innocent) off-by-one error in the runlist code.
- - Fix a buggette in an "should be impossible" case handling where we - continued the attribute lookup loop instead of aborting it. - Use buffer_migrate_page() for the ->migratepage function of all ntfs - address space operations. - Fix comparison of $MFT and $MFTMirr to not bail out when there are - unused, invalid mft records which are the same in both $MFT and - $MFTMirr. - Add support for sparse files which have a compression unit of 0. - Remove all the make_bad_inode() calls. This should only be called - from read inode and new inode code paths. - Limit name length in fs/ntfs/unistr.c::ntfs_nlstoucs() to maximum - allowed by NTFS, i.e. 255 Unicode characters, not including the - terminating NULL (which is not stored on disk). - Improve comments on file attribute flags in fs/ntfs/layout.h. - Fix a bug in fs/ntfs/inode.c::ntfs_read_locked_index_inode() where we - forgot to update a temporary variable so loading index inodes which - have an index allocation attribute failed. - Add a missing call to flush_dcache_mft_record_page() in - fs/ntfs/inode.c::ntfs_write_inode(). - Handle the recently introduced -ENAMETOOLONG return value from - fs/ntfs/unistr.c::ntfs_nlstoucs() in fs/ntfs/namei.c::ntfs_lookup(). - Semaphore to mutex conversion. (Ingo Molnar) - -2.1.26 - Minor bug fixes and updates. - - - Fix a potential overflow in file.c where a cast to s64 was missing in - a left shift of a page index. - The struct inode has had its i_sem semaphore changed to a mutex named - i_mutex. - We have struct kmem_cache now so use it instead of the typedef - kmem_cache_t. (Pekka Enberg) - Implement support for sector sizes above 512 bytes (up to the maximum - supported by NTFS which is 4096 bytes). - Do more detailed reporting of why we cannot mount read-write by - special casing the VOLUME_MODIFIED_BY_CHKDSK flag. - Miscellaneous updates to layout.h. - Cope with attribute list attribute having invalid flags. Windows - copes with this and even chkdsk does not detect or fix this so we - have to cope with it, too. Thanks to Pawel Kot for reporting the - problem. - -2.1.25 - (Almost) fully implement write(2) and truncate(2). - - - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and - {__,}ntfs_cluster_free() to also take an optional attribute search - context as argument. This allows calling these functions with the - mft record mapped. Update all callers. - Fix potential deadlock in ntfs_mft_data_extend_allocation_nolock() - error handling by passing in the active search context when calling - ntfs_cluster_free(). - Change ntfs_cluster_alloc() to take an extra boolean parameter - specifying whether the clusters are being allocated to extend an - attribute or to fill a hole. - Change ntfs_attr_make_non_resident() to call ntfs_cluster_alloc() - with @is_extension set to TRUE and remove the runlist terminator - fixup code as this is now done by ntfs_cluster_alloc(). - Change ntfs_attr_make_non_resident to take the attribute value size - as an extra parameter. This is needed since we need to know the size - before we can map the mft record and our callers always know it. The - reason we cannot simply read the size from the vfs inode i_size is - that this is not necessarily uptodate. This happens when - ntfs_attr_make_non_resident() is called in the ->truncate call path.
- - Fix ntfs_attr_make_non_resident() to update the vfs inode i_blocks - which is zero for a resident attribute but should no longer be zero - once the attribute is non-resident as it then has real clusters - allocated. - Add fs/ntfs/attrib.[hc]::ntfs_attr_extend_allocation(), a function to - extend the allocation of an attribute. Optionally, the data size, - but not the initialized size can be extended, too. - Implement fs/ntfs/inode.[hc]::ntfs_truncate(). It only supports - uncompressed and unencrypted files and it never creates sparse files - at least for the moment (making a file sparse requires us to modify - its directory entries and we do not support directory operations at - the moment). Also, support for highly fragmented files, i.e. ones - whose data attribute is split across multiple extents, is severely - limited. When such a case is encountered, EOPNOTSUPP is returned. - Enable ATTR_SIZE attribute changes in ntfs_setattr(). This completes - the initial implementation of file truncation. Now both open(2)ing - a file with the O_TRUNC flag and the {,f}truncate(2) system calls - will resize a file appropriately. The limitations are that only - uncompressed and unencrypted files are supported. Also, there is - only very limited support for highly fragmented files (the ones whose - $DATA attribute is split into multiple attribute extents). - In attrib.c::ntfs_attr_set() call balance_dirty_pages_ratelimited() - and cond_resched() in the main loop as we could be dirtying a lot of - pages and this ensures we play nice with the VM and the system as a - whole. - Implement file operations ->write, ->aio_write, ->writev for regular - files. This replaces the old use of generic_file_write(), et al and - the address space operations ->prepare_write and ->commit_write. - This means that both sparse and non-sparse (unencrypted and - uncompressed) files can now be extended using the normal write(2) - code path. There are two limitations at present and these are that - we never create sparse files and that we only have limited support - for highly fragmented files, i.e. ones whose data attribute is split - across multiple extents. When such a case is encountered, - EOPNOTSUPP is returned. - $EA attributes can be both resident and non-resident. - Use %z for size_t to fix compilation warnings. (Andrew Morton) - Fix compilation warnings with gcc-4.0.2 on SUSE 10.0. - Document extended attribute ($EA) NEED_EA flag. (Based on libntfs - patch by Yura Pakhuchiy.) - -2.1.24 - Lots of bug fixes and support more clean journal states. - - - Support journals ($LogFile) which have been modified by chkdsk. This - means users can boot into Windows after we marked the volume dirty. - The Windows boot will run chkdsk and then reboot. The user can then - immediately boot into Linux rather than having to do a full Windows - boot first before rebooting into Linux and we will recognize such a - journal and empty it as it is clean by definition. Note, this only - works if chkdsk left the journal in an obviously clean state. - Support journals ($LogFile) with only one restart page as well as - journals with two different restart pages. We sanity check both and - either use the only sane one or the more recent one of the two in the - case that both are valid. - Add fs/ntfs/malloc.h::ntfs_malloc_nofs_nofail() which is analogous to - ntfs_malloc_nofs() but it performs allocations with __GFP_NOFAIL and - hence cannot fail.
- - Use ntfs_malloc_nofs_nofail() in the two critical regions in - fs/ntfs/runlist.c::ntfs_runlists_merge(). This means we no longer - need to panic() if the allocation fails as it now cannot fail. - - Fix two nasty runlist merging bugs that had gone unnoticed so far. - Thanks to Stefano Picerno for the bug report. - - Remove two bogus BUG_ON()s from fs/ntfs/mft.c. - - Fix handling of valid but empty mapping pairs array in - fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress(). - - Report unrepresentable inodes during ntfs_readdir() as KERN_WARNING - messages and include the inode number. Thanks to Yura Pakhuchiy for - pointing this out. - - Change ntfs_rl_truncate_nolock() to throw away the runlist if the new - length is zero. - - Add runlist.[hc]::ntfs_rl_punch_nolock() which punches a caller - specified hole into a runlist. - - Fix a bug in fs/ntfs/index.c::ntfs_index_lookup(). When the returned - index entry is in the index root, we forgot to set the @ir pointer in - the index context. Thanks to Yura Pakhuchiy for finding this bug. - - Remove bogus setting of PageError in ntfs_read_compressed_block(). - - Add fs/ntfs/attrib.[hc]::ntfs_resident_attr_value_resize(). - - Fix a bug in ntfs_map_runlist_nolock() where we forgot to protect - access to the allocated size in the ntfs inode with the size lock. - - Fix ntfs_attr_vcn_to_lcn_nolock() and ntfs_attr_find_vcn_nolock() to - return LCN_ENOENT when there is no runlist and the allocated size is - zero. - - Fix load_attribute_list() to handle the case of a NULL runlist. - - Fix handling of sparse attributes in ntfs_attr_make_non_resident(). - - Add BUG() checks to ntfs_attr_make_non_resident() and ntfs_attr_set() - to ensure that these functions are never called for compressed or - encrypted attributes. - - Fix cluster (de)allocators to work when the runlist is NULL and more - importantly to take a locked runlist rather than them locking it - which leads to lock reversal. - - Truncate {a,c,m}time to the ntfs supported time granularity when - updating the times in the inode in ntfs_setattr(). - - Fixup handling of sparse, compressed, and encrypted attributes in - fs/ntfs/inode.c::ntfs_read_locked_{,attr_,index_}inode(), - fs/ntfs/aops.c::ntfs_{read,write}page(). - - Make ntfs_write_block() not instantiate sparse blocks if they contain - only zeroes. - - Optimize fs/ntfs/aops.c::ntfs_write_block() by extending the page - lock protection over the buffer submission for i/o which allows the - removal of the get_bh()/put_bh() pairs for each buffer. - - Fix fs/ntfs/aops.c::ntfs_{read,write}_block() to handle the case - where a concurrent truncate has truncated the runlist under our feet. - - Fix page_has_buffers()/page_buffers() handling in fs/ntfs/aops.c. - - In fs/ntfs/aops.c::ntfs_end_buffer_async_read(), use a bit spin lock - in the first buffer head instead of a driver global spin lock to - improve scalability. - - Minor fix to error handling and error message display in - fs/ntfs/aops.c::ntfs_prepare_nonresident_write(). - - Change the mount options {u,f,d}mask to always parse the number as - an octal number to conform to how chmod(1) works, too. Thanks to - Giuseppe Bilotta and Horst von Brand for pointing out the errors of - my ways. - - Fix various bugs in the runlist merging code. (Based on libntfs - changes by Richard Russon.) - - Fix sparse warnings that have crept in over time. 
- - Change ntfs_cluster_free() to require a write locked runlist on entry - since we otherwise get into a lock reversal deadlock if a read locked - runlist is passed in. In the process also change it to take an ntfs - inode instead of a vfs inode as parameter. - Fix the definition of the CHKD ntfs record magic. It had an off by - two error causing it to be CHKB instead of CHKD. - Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the - count to become negative and hence we had a wild memset() scribbling - all over the system's ram. - -2.1.23 - Implement extension of resident files and make writing safe as well as - many bug fixes, cleanups, and enhancements... - - - Add printk rate limiting for ntfs_warning() and ntfs_error() when - compiled without debug. This avoids a possible denial of service - attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this - out. - Fix compilation warnings on ia64. (Randy Dunlap) - Use i_size_{read,write}() instead of reading i_size by hand and cache - the value where appropriate. - Add size_lock to the ntfs_inode structure. This is an rw spinlock - and it locks against access to the inode sizes. Note, ->size_lock - is also accessed from irq context so you must use the _irqsave and - _irqrestore lock and unlock functions, respectively. Protect all - accesses to allocated_size, initialized_size, and compressed_size. - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers. - Implement extension of resident files in the regular file write code - paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present - this only works until the data attribute becomes too big for the mft - record after which we abort the write returning -EOPNOTSUPP from - ntfs_prepare_write(). - Add disable_sparse mount option together with a per volume sparse - enable bit which is set appropriately and a per inode sparse disable - bit which is preset on some system file inodes as appropriate. - Enforce that sparse support is disabled on NTFS volumes pre 3.0. - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in - the creation of the unmapped runlist element for the base attribute - extent. - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking - helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist(). - This allows us to map runlist fragments with the runlist lock already - held without having to drop and reacquire it around the call. Adapt - all callers. - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked - runlist. This allows us to find runlist elements with the runlist - lock already held without having to drop and reacquire it around the - call. Adapt all callers. - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a - warning in the do_div() call on sparc32. Thanks to Meelis Roos for - the report and analysis of the warning. - Fix a nasty runlist merge bug when merging two holes. - Set the ntfs_inode->allocated_size to the real allocated size in the - mft record for resident attributes (fs/ntfs/inode.c). - Small readability cleanup to use "a" instead of "ctx->attr" - everywhere (fs/ntfs/inode.c). - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the - definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also, - declare ntfs_export_ops in fs/ntfs/ntfs.h. - Correct sparse file handling.
The compressed values need to be - checked and set in the ntfs inode as done for compressed files and - the compressed size needs to be used for vfs inode->i_blocks instead - of the allocated size, again, as done for compressed files. - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be - non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident(). - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new - write code. - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after - dropping the read lock and taking the write lock we were not checking - whether someone else did not already do the work we wanted to do. - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to - ntfs_attr_find_vcn_nolock() and update all callers. - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident(). - Fix sign of various error return values to be negative in - fs/ntfs/lcnalloc.c. - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and - handle the case where an attribute is converted from resident to - non-resident by a concurrent file write. - Remove checks for NULL before calling kfree() since kfree() does the - checking itself. (Jesper Juhl) - Some utilities modify the boot sector but do not update the checksum. - Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to - only emit a warning when the checksum is incorrect rather than - refusing the mount. Thanks to Bernd Casimir for pointing this - problem out. - Update attribute definition handling. - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants. - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000. - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for - better code generation and one less sparse warning in fs/ntfs/aops.c. - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg) - Use C99 style structure initialization after memory allocation where - possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and - Pekka Enberg. - Stamp the transaction log ($UsnJrnl), aka user space journal, if it - is active on the volume and we are mounting read-write or remounting - from read-only to read-write. - Fix a bug in address space operations error recovery code paths where - if the runlist was not mapped at all and a mapping error occurred we - would leave the runlist locked on exit to the function so that the - next access to the same file would try to take the lock and deadlock. - Detect the case when Windows has been suspended to disk on the volume - to be mounted and if this is the case do not allow (re)mounting - read-write. This is done by parsing hiberfil.sys if present. - Fix several occurrences of a bug where we would perform 'var & ~const' - with a 64-bit variable and an int, i.e. 32-bit, constant. This causes - the higher order 32-bits of the 64-bit variable to be zeroed. To fix - this cast the 'const' to the same 64-bit type as 'var'. - Change the runlist terminator of the newly allocated cluster(s) to - LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist - code gets confused. - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs() - and ntfs_mapping_pairs_build() to allow the runlist encoding to be - partial which is desirable when filling holes in sparse attributes. - Update all callers. - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs - if the requested vcn is inside it.
Otherwise we get into problems - when we try to map an out of bounds vcn because we then try to map - the already mapped runlist fragment which causes - ntfs_mapping_pairs_decompress() to fail and return error. Update - ntfs_attr_find_vcn_nolock() accordingly. - Fix a nasty deadlock that appeared in recent kernels. - The situation: VFS inode X on a mounted ntfs volume is dirty. For - same inode X, the ntfs_inode is dirty and thus corresponding on-disk - inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging - to the table of inodes, i.e. $MFT, inode 0. - What happens: - Process 1: sys_sync()/umount()/whatever... calls - __sync_single_inode() for $MFT -> do_writepages() -> write_page for - the dirty page containing the on-disk inode X, the page is now locked - -> ntfs_write_mst_block() which clears PageUptodate() on the page to - prevent anyone else getting hold of it whilst it does the write out. - This is necessary as the on-disk inode needs "fixups" applied before - the write to disk which are removed again after the write and - PageUptodate is then set again. It then analyses the page looking - for dirty on-disk inodes and when it finds one it calls - ntfs_may_write_mft_record() to see if it is safe to write this - on-disk inode. This then calls ilookup5() to check if the - corresponding VFS inode is in icache(). This in turn calls ifind() - which waits on the inode lock via wait_on_inode whilst holding the - global inode_lock. - Process 2: pdflush results in a call to __sync_single_inode for the - same VFS inode X on the ntfs volume. This locks the inode (I_LOCK) - then calls write-inode -> ntfs_write_inode -> map_mft_record() -> - read_cache_page() for the page (in page cache of table of inodes - $MFT, inode 0) containing the on-disk inode. This page has - PageUptodate() clear because of Process 1 (see above) so - read_cache_page() blocks when it tries to take the page lock for the - page so it can call ntfs_read_page(). - Thus Process 1 is holding the page lock on the page containing the - on-disk inode X and it is waiting on the inode X to be unlocked in - ifind() so it can write the page out and then unlock the page. - And Process 2 is holding the inode lock on inode X and is waiting for - the page to be unlocked so it can call ntfs_readpage() or discover - that Process 1 set PageUptodate() again and use the page. - Thus we have a deadlock due to ifind() waiting on the inode lock. - The solution: The fix is to use the newly introduced - ilookup5_nowait() which does not wait on the inode's lock and hence - avoids the deadlock. This is safe as we do not care about the VFS - inode and only use the fact that it is in the VFS inode cache and the - fact that the vfs and ntfs inodes are one struct in memory to find - the ntfs inode in memory if present. Also, the ntfs inode has its - own locking so it does not matter if the vfs inode is locked. - Fix bug in mft record writing where we forgot to set the device in - the buffers when mapping them after the VM had discarded them. - Thanks to Martin MOKREJŠ for the bug report. - -2.1.22 - Many bug and race fixes and error handling improvements. - - - Improve error handling in fs/ntfs/inode.c::ntfs_truncate(). - - Change fs/ntfs/inode.c::ntfs_truncate() to return an error code - instead of void and provide a helper ntfs_truncate_vfs() for the - vfs ->truncate method. - - Add a new ntfs inode flag NInoTruncateFailed() and modify - fs/ntfs/inode.c::ntfs_truncate() to set and clear it appropriately.
- - Fix min_size and max_size definitions in ATTR_DEF structure in - fs/ntfs/layout.h to be signed. - Add attribute definition handling helpers to fs/ntfs/attrib.[hc]: - ntfs_attr_size_bounds_check(), ntfs_attr_can_be_non_resident(), and - ntfs_attr_can_be_resident(), which in turn use the new private helper - ntfs_attr_find_in_attrdef(). - In fs/ntfs/aops.c::mark_ntfs_record_dirty(), take the - mapping->private_lock around the dirtying of the buffer heads - analogous to the way it is done in __set_page_dirty_buffers(). - Ensure the mft record size does not exceed the PAGE_CACHE_SIZE at - mount time as this cannot work with the current implementation. - Check for location of attribute name and improve error handling in - general in fs/ntfs/inode.c::ntfs_read_locked_inode() and friends. - In fs/ntfs/aops.c::ntfs_writepage(), if the page is fully outside - i_size, i.e. race with truncate, invalidate the buffers on the page - so that they become freeable and hence the page does not leak. - Remove unused function fs/ntfs/runlist.c::ntfs_rl_merge(). (Adrian - Bunk) - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_find() that resulted in - a NULL pointer dereference in the error code path when a corrupt - attribute was found. (Thanks to Domen Puncer for the bug report.) - Add MODULE_VERSION() to fs/ntfs/super.c. - Make several functions and variables static. (Adrian Bunk) - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() so it allocates - buffers for the page if they are not present and then marks the - buffers belonging to the ntfs record dirty. This causes the buffers - to become busy and hence they are safe from removal until the page - has been written out. - Fix stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() in the - error handling code path that resulted in a BUG() due to trying to - unmap an extent mft record when the mapping of it had failed and it - thus was not mapped. (Thanks to Ken MacFerrin for the bug report.) - Drop the runlist lock after the vcn has been read in - fs/ntfs/lcnalloc.c::__ntfs_cluster_free(). - Rewrite handling of multi sector transfer errors. We now do not set - PageError() when such errors are detected in the async i/o handler - fs/ntfs/aops.c::ntfs_end_buffer_async_read(). All users of mst - protected attributes now check the magic of each ntfs record as they - use it and act appropriately. This has the effect of making errors - granular per ntfs record rather than per page which solves the case - where we cannot access any of the ntfs records in a page when a - single one of them had an mst error. (Thanks to Ken MacFerrin for - the bug report.) - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date() - where we failed to release i_mutex on the $Quota/$Q attribute inode. - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup(). - Add mapping of unmapped buffers to all remaining code paths, i.e. - fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(), - and write_mft_record_nolock(). From now on we require that the - complete runlist for the mft mirror is always mapped into memory. - Add creation of buffers to fs/ntfs/mft.c::ntfs_sync_mft_mirror(). - Improve error handling in fs/ntfs/aops.c::ntfs_{read,write}_block(). - Cleanup fs/ntfs/aops.c::ntfs_{read,write}page() since we know that a - resident attribute will be smaller than a page which makes the code - simpler. Also make the code more tolerant to concurrent ->truncate.
- -2.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator. - - - Implement extent mft record deallocation - fs/ntfs/mft.c::ntfs_extent_mft_record_free(). - Split runlist related functions off from attrib.[hc] to runlist.[hc]. - Add vol->mft_data_pos and initialize it at mount time. - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to - ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to - ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to - ntfs_runlists_merge() and adapt all callers. - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(), - ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(), - and ntfs_mapping_pairs_build(), adapted from libntfs. - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not - static and add a declaration for it to lcnalloc.h. - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static - inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the - cluster bitmap lock for the duration of the call. - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize(). - Implement the equivalent of memset() for an ntfs attribute in - fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch - fs/ntfs/logfile.c::ntfs_empty_logfile() to using it. - Remove unnecessary casts from LCN_* constants. - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock(). - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and - change MFT_RECORD to contain the NTFS 3.1+ specific fields. - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which - marks all buffers belonging to an ntfs record dirty, followed by - marking the page the ntfs record is in dirty and also marking the vfs - inode containing the ntfs record dirty (I_DIRTY_PAGES). - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the - new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no - longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty(). - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting - include errors. - Move the typedefs for runlist_element and runlist from types.h to - runlist.h and fix resulting include errors. - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c. - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper - mark_ntfs_record_dirty() which also changes the behaviour in that we - now set the buffers belonging to the mft record dirty as well as the - page itself. - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror() - to cope with the fact that there now are dirty buffers in mft pages. - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper - mark_ntfs_record_dirty() and thus to set the buffers belonging to the - mft record dirty as well as the page itself. - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap, - slightly modified by me) - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if - the mft record is already locked and otherwise behaves the same way - as fs/ntfs/mft.c::map_mft_record(). - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only - writes the mft record if the buffers belonging to it are dirty. - Otherwise we assume that it was written out by other means already. - Attempting to write outside initialized size is _not_ a bug so remove - the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in - fact required to write outside initialized size when preparing to - extend the initialized size.
- - Map the page instead of using page_address() before writing to it in - fs/ntfs/aops.c::ntfs_mft_writepage(). - Provide exclusion between opening an inode / mapping an mft record - and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage() - by setting the page not uptodate throughout ntfs_mft_writepage(). - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block() - to ensure no one can see the page whilst the mst fixups are applied. - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which - checks if an mft record may be written out safely obtaining any - necessary locks in the process. This is used by - fs/ntfs/aops.c::ntfs_write_mst_block(). - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for - writing mft records and improve its error handling in the process. - Now if any of the records in the page fail to be written out, all - other records will be written out instead of aborting completely. - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops. - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the - ntfs_mst_aops for all inodes which are NInoMstProtected() and - ntfs_aops for all other inodes. - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to - ntfs_sync_mft_mirror{,_umount}() and change their parameters so they - no longer require an ntfs inode to be present. Update all callers. - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror(). - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror() - to ensure no one can see the page whilst the mst fixups are applied. - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and - fs/ntfs/mft.c::try_map_mft_record(). - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it - with the ntfs inode which contains the page rather than the ntfs - inode the mft record of which is in the page. - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the - index inode bitmap inode release code from there to - fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph - Hellwig for spotting this.) - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the - inode semaphore around the code that sets ni->itype.index.bmp_ino to - NULL and reorganize the code to optimize it a bit. (Thanks to - Christoph Hellwig for spotting this.) - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the - ntfs inode as a parameter as this is confusing and misleading and the - needed ntfs inode is available via NTFS_I(page->mapping->host). - Adapt all callers to this change. - Modify fs/ntfs/mft.c::write_mft_record_nolock() and - fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state - of the first buffer in a record and to take this as the ntfs record - dirty state. We cannot look at the dirty state for subsequent - buffers because we might be racing with - fs/ntfs/aops.c::mark_ntfs_record_dirty(). - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to - inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and - add a declaration for it to inode.h. Fix some compilation issues - that resulted due to #includes and header file interdependencies. - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode(). - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF(). - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft - record sequence number if it is specified (i.e. not zero).
- - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper - functions used by it. - - Update Documentation/filesystems/ntfs.txt with instructions on how to - use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes - the linear raid problem with the Software RAID / MD driver when one - or more of the devices has an odd number of sectors. - -2.1.20 - Fix two stupid bugs introduced in 2.1.18 release. - - - Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx() - where we did not clear ctx->al_entry but it was still set due to - changes in ntfs_attr_lookup() and ntfs_external_attr_find() in - particular. - - Fix another stupid bug in fs/ntfs/attrib.c::ntfs_external_attr_find() - where we forgot to unmap the extent mft record when we had finished - enumerating an attribute which caused a bug check to trigger when the - VFS calls ->clear_inode. - -2.1.19 - Many cleanups, improvements, and a minor bug fix. - - - Update ->setattr (fs/ntfs/inode.c::ntfs_setattr()) to refuse to - change the uid, gid, and mode of an inode as we do not support NTFS - ACLs yet. - - Remove BKL use from ntfs_setattr() syncing up with the rest of the - kernel. - - Get rid of the ugly transparent union in fs/ntfs/dir.c::ntfs_readdir() - and ntfs_filldir() as per suggestion from Al Viro. - - Change '\0' and L'\0' to simply 0 as per advice from Linus Torvalds. - - Update ->truncate (fs/ntfs/inode.c::ntfs_truncate()) to check if the - inode size has changed and to only output an error if so. - - Rename fs/ntfs/attrib.h::attribute_value_length() to ntfs_attr_size(). - - Add le{16,32,64} as well as sle{16,32,64} data types to - fs/ntfs/types.h. - - Change ntfschar to be le16 instead of u16 in fs/ntfs/types.h. - - Add le versions of VCN, LCN, and LSN called leVCN, leLCN, and leLSN, - respectively, to fs/ntfs/types.h. - - Update endianness conversion macros in fs/ntfs/endian.h to use the - new types as appropriate. - - Do proper type casting when using sle64_to_cpup() in fs/ntfs/dir.c - and index.c. - - Add leMFT_REF data type to fs/ntfs/layout.h. - - Update all NTFS header files with the new little endian data types. - Affected files are fs/ntfs/layout.h, logfile.h, and time.h. - - Do proper type casting when using ntfs_is_*_recordp() in - fs/ntfs/logfile.c, mft.c, and super.c. - - Fix all the sparse bitwise warnings. Had to change all the typedef - enums storing little endian values to simple enums plus a typedef for - the datatype to make sparse happy. - - Fix a bug found by the new sparse bitwise warnings where the default - upcase table was defined as a pointer to wchar_t rather than ntfschar - in fs/ntfs/ntfs.h and super.c. - - Change {const_,}cpu_to_le{16,32}(0) to just 0 as suggested by Al Viro. - -2.1.18 - Fix scheduling latencies at mount time as well as an endianness bug. - - - Remove vol->nr_mft_records as it was pretty meaningless and optimize - the calculation of total/free inodes as used by statfs(). - - Fix scheduling latencies in ntfs_fill_super() by dropping the BKL - because the code itself is using the ntfs_lock semaphore which - provides safe locking. (Ingo Molnar) - - Fix a potential bug in fs/ntfs/mft.c::map_extent_mft_record() that - could occur in the future for when we start closing/freeing extent - inodes if we don't set base_ni->ext.extent_ntfs_inos to NULL after - we free it. 
- - Rename {find,lookup}_attr() to ntfs_attr_{find,lookup}() as well as - find_external_attr() to ntfs_external_attr_find() to cleanup the - namespace a bit and to be more consistent with libntfs. - Rename {{re,}init,get,put}_attr_search_ctx() to - ntfs_attr_{{re,}init,get,put}_search_ctx() as well as the type - attr_search_context to ntfs_attr_search_ctx. - Force use of ntfs_attr_find() in ntfs_attr_lookup() when searching - for the attribute list attribute itself. - Fix endianness bug in ntfs_external_attr_find(). - Change ntfs_{external_,}attr_find() to return 0 on success, -ENOENT - if the attribute is not found, and -EIO on real error. In the case - of -ENOENT, the search context is updated to describe the attribute - before which the attribute being searched for would need to be - inserted if such an action were to be desired and in the case of - ntfs_external_attr_find() the search context is also updated to - indicate the attribute list entry before which the attribute list - entry of the attribute being searched for would need to be inserted - if such an action were to be desired. Also make ntfs_find_attr() - static and remove its prototype from attrib.h as it is not used - anywhere other than attrib.c. Update ntfs_attr_lookup() and all - callers of ntfs_{external,}attr_{find,lookup}() for the new return - values. - Minor cleanup of fs/ntfs/inode.c::ntfs_init_locked_inode(). - -2.1.17 - Fix bugs in mount time error code paths and other updates. - - - Implement bitmap modification code (fs/ntfs/bitmap.[hc]). This - includes functions to set/clear a single bit or a run of bits. - Add fs/ntfs/attrib.[hc]::ntfs_find_vcn() which returns the locked - runlist element containing a particular vcn. It also takes care of - mapping any needed runlist fragments. - Implement cluster (de-)allocation code (fs/ntfs/lcnalloc.[hc]). - Load attribute definition table from $AttrDef at mount time. - Fix bugs in mount time error code paths involving (de)allocation of - the default and volume upcase tables. - Remove ntfs_nr_mounts as it is no longer used. - -2.1.16 - Implement access time updates, file sync, async io, and read/writev. - - - Add support for readv/writev and aio_read/aio_write (fs/ntfs/file.c). - This is done by setting the appropriate file operations pointers to - the generic helper functions provided by mm/filemap.c. - Implement fsync, fdatasync, and msync both for files (fs/ntfs/file.c) - and directories (fs/ntfs/dir.c). - Add support for {a,m,c}time updates to inode.c::ntfs_write_inode(). - Note, except for the root directory and any other system files opened - by the user, the system files will not have their access times - updated as they are only accessed at the inode level and hence the - file level functions which cause the times to be updated are never - invoked. - -2.1.15 - Invalidate quotas when (re)mounting read-write. - - - Add new element itype.index.collation_rule to the ntfs inode - structure and set it appropriately in ntfs_read_locked_inode(). - Implement a new inode type "index" to allow efficient access to the - indices found in various system files and adapt inode handling - accordingly (fs/ntfs/inode.[hc]). An index inode is essentially an - attribute inode (NInoAttr() is true) with an attribute type of - AT_INDEX_ALLOCATION. As such, it is no longer allowed to call - ntfs_attr_iget() with an attribute type of AT_INDEX_ALLOCATION as - there would be no way to distinguish between normal attribute inodes - and index inodes.
The function to obtain an index inode is - ntfs_index_iget() and it uses the helper function - ntfs_read_locked_index_inode(). Note, we do not overload - ntfs_attr_iget() as indices consist of multiple attributes so using - ntfs_attr_iget() to obtain an index inode would be confusing. - - Ensure that there is no overflow when doing page->index << - PAGE_CACHE_SHIFT by casting page->index to s64 in fs/ntfs/aops.c. - - Use atomic kmap instead of kmap() in fs/ntfs/aops.c::ntfs_read_page() - and ntfs_read_block(). - - Use case sensitive attribute lookups instead of case insensitive ones. - - Lock all page cache pages belonging to mst protected attributes while - accessing them to ensure we never see corrupt data while the page is - under writeout. - - Add framework for generic ntfs collation (fs/ntfs/collation.[hc]). - We have ntfs_is_collation_rule_supported() to check if the collation - rule you want to use is supported and ntfs_collation() which actually - collates two data items. We currently only support COLLATION_BINARY - and COLLATION_NTOFS_ULONG but support for other collation rules will - be added as the need arises. - - Add a new type, ntfs_index_context, to allow retrieval of an index - entry using the corresponding index key. To get an index context, - use ntfs_index_ctx_get() and to release it, use ntfs_index_ctx_put(). - This also adds a new slab cache for the index contexts. To lookup a - key in an index inode, use ntfs_index_lookup(). After modifying an - index entry, call ntfs_index_entry_flush_dcache_page() followed by - ntfs_index_entry_mark_dirty() to ensure the changes are written out - to disk. For details see fs/ntfs/index.[hc]. Note, at present, if - an index entry is in the index allocation attribute rather than the - index root attribute it will not be written out (you will get a - warning message about discarded changes instead). - - Load the quota file ($Quota) and check if quota tracking is enabled - and if so, mark the quotas out of date. This causes windows to - rescan the volume on boot and update all quota entries. - - Add a set_page_dirty address space operation for ntfs_m[fs]t_aops. - It is simply set to __set_page_dirty_nobuffers() to make sure that - running set_page_dirty() on a page containing mft/ntfs records will - not affect the dirty state of the page buffers. - - Add fs/ntfs/index.c::__ntfs_index_entry_mark_dirty() which sets all - buffers that are inside the ntfs record in the page dirty after which - it sets the page dirty. This allows ->writepage to only write the - dirty index records rather than having to write all the records in - the page. Modify fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to - use this rather than __set_page_dirty_nobuffers(). - - Implement fs/ntfs/aops.c::ntfs_write_mst_block() which enables the - writing of page cache pages belonging to mst protected attributes - like the index allocation attribute in directory indices and other - indices like $Quota/$Q, etc. This means that the quota is now marked - out of date on all volumes rather than only on ones where the quota - defaults entry is in the index root attribute of the $Quota/$Q index. - -2.1.14 - Fix an NFSd caused deadlock reported by several users. - - - Modify fs/ntfs/ntfs_readdir() to copy the index root attribute value - to a buffer so that we can put the search context and unmap the mft - record before calling the filldir() callback. 
We need to do this - because of NFSd which calls ->lookup() from its filldir() callback - and this causes NTFS to deadlock as ntfs_lookup() maps the mft record - of the directory and since ntfs_readdir() has got it mapped already - ntfs_lookup() deadlocks. - -2.1.13 - Enable overwriting of resident files and housekeeping of system files. - - - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes - keeping the mft mirror in sync with the mft when mirrored mft records - are written. The functions are write_mft_record{,_nolock}(). The - implementation is quite rudimentary for now with lots of things not - implemented yet but I am not sure any of them can actually occur so - I will wait for people to hit each one and only then implement it. - - Commit open system inodes at umount time. This should make it - virtually impossible for sync_mft_mirror_umount() to ever be needed. - - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the - ntfs super operations. This gives us inode writing via the VFS inode - dirty code paths. Note: Access time updates are not implemented yet. - - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make - fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus - finally enabling resident file overwrite! (-8 This also includes a - placeholder for ->writepage (ntfs_mft_writepage()), which for now - just redirties the page and returns. Also, at umount time, we for - now throw away all mft data page cache pages after the last call to - ntfs_commit_inode() in the hope that all inodes will have been - written out by then and hence no dirty (meta)data will be lost. We - also check for this case and emit an error message telling the user - to run chkdsk. - - Use set_page_writeback() and end_page_writeback() in the resident - attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise - the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the - page is clean. - - Implement ntfs_mft_writepage() so it now checks if any of the mft - records in the page are dirty and if so redirties the page and - returns. Otherwise it just returns (after doing set_page_writeback(), - unlock_page(), end_page_writeback() or the radix-tree tag - PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus - allowing the VM to do with the page as it pleases. Also, at umount - time, now only throw away dirty mft (meta)data pages if dirty inodes - are present and ask the user to email us if they see this happening. - - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume - information flags (fs/ntfs/super.c). - - Mark the volume dirty when (re)mounting read-write and mark it clean - when unmounting or remounting read-only. If any volume errors are - found, the volume is left marked dirty to force chkdsk to run. - - Add code to set the NT4 compatibility flag when (re)mounting - read-write for newer NTFS versions but leave it commented out for now - since we do not make any modifications that are NTFS 1.2 specific yet - and since setting this flag breaks Captive-NTFS which is not nice. - This code must be enabled once we start writing NTFS 1.2 specific - changes otherwise the Windows NTFS driver might crash / cause corruption. - -2.1.12 - Fix the second fix to the decompression engine and some cleanups. - - - Add a new address space operations struct, ntfs_mst_aops, for mst - protected attributes.
This is because the default ntfs_aops do not - make sense with mst protected data and were they to write anything to - such an attribute they would cause data corruption so we provide - ntfs_mst_aops which does not have any write related operations set. - - Cleanup dirty ntfs inode handling (fs/ntfs/inode.[hc]) which also - includes an adapted ntfs_commit_inode() and an implementation of - ntfs_write_inode() which for now just cleans dirty inodes without - writing them (it does emit a warning that this is happening). - - Undo the second decompression engine fix (see 2.1.9 release ChangeLog - entry) as it was only fixing a theoretical bug but at the same time - it badly broke the handling of sparse and uncompressed compression - blocks. - -2.1.11 - Driver internal cleanups. - - - Only build logfile.o if building the driver with read-write support. - - Really final white space cleanups. - - Use generic_ffs() instead of ffs() in logfile.c which allows the - log_page_size variable to be optimized by gcc into a constant. - - Rename uchar_t to ntfschar everywhere as uchar_t is unsigned 1-byte - char as defined by POSIX and as found on some systems. - -2.1.10 - Force read-only (re)mounting of volumes with unsupported volume flags. - - - Finish off the white space cleanups (remove trailing spaces, etc). - - Clean up ntfs_fill_super() and ntfs_read_inode_mount() by removing - the kludges around the first iget(). Instead of (re)setting ->s_op - we have the $MFT inode set up by explicit new_inode() / set ->i_ino / - insert_inode_hash() / call ntfs_read_inode_mount() directly. This - kills the need for a second super_operations and allows returning an error - from ntfs_read_inode_mount() without resorting to ugly "poisoning" - tricks. (Al Viro) - - Force read-only (re)mounting if any of the following bits are set in - the volume information flags: - VOLUME_IS_DIRTY, VOLUME_RESIZE_LOG_FILE, - VOLUME_UPGRADE_ON_MOUNT, VOLUME_DELETE_USN_UNDERWAY, - VOLUME_REPAIR_OBJECT_ID, VOLUME_MODIFIED_BY_CHKDSK - To make this easier we define VOLUME_MUST_MOUNT_RO_MASK with all the - above bits set so the test is made easy. - -2.1.9 - Fix two bugs in decompression engine. - - - Fix a bug where we would not always detect that we have reached the - end of a compression block because we were ending at minus one byte - which is effectively the same as being at the end. The fix is to - check whether the uncompressed buffer has been fully filled and if so - we assume we have reached the end of the compression block. A big - thank you to Marcin Gibuła for the bug report, the assistance in - tracking down the bug and testing the fix. - - Fix a possible bug where when a compressed read is truncated to the - end of the file, the offset inside the last page was not truncated. - -2.1.8 - Handle $MFT mirror and $LogFile, improve time handling, and cleanups. - - - Use get_bh() instead of manual atomic_inc() in fs/ntfs/compress.c. - - Modify fs/ntfs/time.c::ntfs2utc(), get_current_ntfs_time(), and - utc2ntfs() to work with struct timespec instead of time_t on the - Linux UTC time side thus preserving the full precision of the NTFS - time and only losing up to 99 nano-seconds in the Linux UTC time. - - Move fs/ntfs/time.c to fs/ntfs/time.h and make the time functions - static inline. - - Remove unused ntfs_dirty_inode(). - - Cleanup super operations declaration in fs/ntfs/super.c. - - Wrap flush_dcache_mft_record_page() in #ifdef NTFS_RW.
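The struct timespec conversion described in the 2.1.8 entry above is a fixed-point scaling between Linux UTC time and NTFS's 100ns units counted from 1601; a sketch along the lines of fs/ntfs/time.h (constants and names as in that header, to the best of recollection):

    /* 100ns intervals from 1601-01-01 to 1970-01-01 (369 years, 89 leap days). */
    #define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)

    static inline sle64 utc2ntfs(const struct timespec ts)
    {
        /* tv_nsec / 100 truncates, hence at most 99ns are lost. */
        return cpu_to_sle64((s64)ts.tv_sec * 10000000 +
                ts.tv_nsec / 100 + NTFS_TIME_OFFSET);
    }

    static inline struct timespec ntfs2utc(const sle64 time)
    {
        struct timespec ts;
        u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);

        /* do_div() divides t in place and returns the remainder. */
        ts.tv_nsec = do_div(t, 10000000) * 100;
        ts.tv_sec = t;
        return ts;
    }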
- - Add NInoTestSetFoo() and NInoTestClearFoo() macro magic to - fs/ntfs/inode.h and use it to declare NInoTest{Set,Clear}Dirty. - - Move typedefs for ntfs_attr and test_t from fs/ntfs/inode.c to - fs/ntfs/inode.h so they can be used elsewhere. - - Determine the mft mirror size as the number of mirrored mft records - and store it in ntfs_volume->mftmirr_size (fs/ntfs/super.c). - - Load the mft mirror at mount time and compare the mft records stored - in it to the ones in the mft. Force a read-only mount if the two do - not match (fs/ntfs/super.c). - - Fix type casting related warnings on 64-bit architectures. Thanks - to Meelis Roos for reporting them. - - Move %L to %ll as %L is floating point and %ll is integer which is - what we want. - - Read the journal ($LogFile) and determine if the volume has been - shutdown cleanly and force a read-only mount if not (fs/ntfs/super.c - and fs/ntfs/logfile.c). This is a little bit of a crude check in - that we only look at the restart areas and not at the actual log - records so that there will be a very small number of cases where we - think that a volume is dirty when in fact it is clean. This should - only affect volumes that have not been shutdown cleanly and did not - have any pending, non-check-pointed i/o. - - If the $LogFile indicates a clean shutdown and a read-write (re)mount - is requested, empty $LogFile by overwriting it with 0xff bytes to - ensure that Windows cannot cause data corruption by replaying a stale - journal after Linux has written to the volume. - -2.1.7 - Enable NFS exporting of mounted NTFS volumes. - - - Set i_generation in the VFS inode from the seq_no of the NTFS inode. - - Make ntfs_lookup() NFS export safe, i.e. use d_splice_alias(), etc. - - Implement ->get_dentry() in fs/ntfs/namei.c::ntfs_get_dentry() as the - default doesn't allow inode number 0 which is a valid inode on NTFS - and even if it did allow that it uses iget() instead of ntfs_iget() - which makes it useless for us. - - Implement ->get_parent() in fs/ntfs/namei.c::ntfs_get_parent() as the - default just returns -EACCES which is not very useful. - - Define export operations (->s_export_op) for NTFS (ntfs_export_ops) - and set them up in the super block at mount time (super.c) this - allows mounted NTFS volumes to be exported via NFS. - - Add missing return -EOPNOTSUPP; in - fs/ntfs/aops.c::ntfs_commit_nonresident_write(). - - Enforce no atime and no dir atime updates at mount/remount time as - they are not implemented yet anyway. - - Move a few assignments in fs/ntfs/attrib.c::load_attribute_list() to - after a NULL check. Thanks to Dave Jones for pointing this out. - -2.1.6 - Fix minor bug in handling of compressed directories. - - - Fix bug in handling of compressed directories. A compressed - directory is not really compressed so when we set the ->i_blocks - field of a compressed directory inode we were setting it from the - non-existing field ni->itype.compressed.size which gave random - results... For directories we now always use ni->allocated_size. - -2.1.5 - Fix minor bug in attribute list attribute handling. - - - Fix bug in attribute list handling. Actually it is not as much a bug - as too much protection in that we were not allowing attribute lists - which waste space on disk while Windows XP clearly allows it and in - fact creates such attribute lists so our driver was failing. - - Update NTFS documentation ready for 2.6 kernel release. - -2.1.4 - Reduce compiler requirements. 
- - Remove all uses of unnamed structs and unions in the driver to make - old and newer gcc versions happy. Makes it a bit uglier IMO but at - least people will stop hassling me about it. - -2.1.3 - Important bug fixes in corner cases. - - - super.c::parse_ntfs_boot_sector(): Correct the check for 64-bit - clusters. (Philipp Thomas) - - attrib.c::load_attribute_list(): Fix bug when initialized_size is a - multiple of the block_size but not the cluster size. (Szabolcs - Szakacsits) - -2.1.2 - Important bug fixes alleviating the hangs in statfs. - - - Fix buggy free cluster and free inode determination logic. - -2.1.1 - Minor updates. - - - Add handling for initialized_size != data_size in compressed files. - - Reduce function local stack usage from 0x3d4 bytes to just noise in - fs/ntfs/upcase.c. (Randy Dunlap) - - Remove compiler warnings for newer gcc. - - Pages are no longer kmapped by mm/filemap.c::generic_file_write() - around calls to ->{prepare,commit}_write. Adapt NTFS appropriately - in fs/ntfs/aops.c::ntfs_prepare_nonresident_write() by using - kmap_atomic(KM_USER0). - -2.1.0 - First steps towards write support: implement file overwrite. - - - Add configuration option for developmental write support with an - appropriately scary configuration help text. - - Initial implementation of fs/ntfs/aops.c::ntfs_writepage() and its - helper fs/ntfs/aops.c::ntfs_write_block(). This enables mmap(2) based - overwriting of existing files on ntfs. Note: Resident files are - only written into memory, and not written out to disk at present, so - avoid writing to files smaller than about 1kiB. - - Initial implementation of fs/ntfs/aops.c::ntfs_prepare_write(), its - helper fs/ntfs/aops.c::ntfs_prepare_nonresident_write() and their - counterparts, fs/ntfs/aops.c::ntfs_commit_write(), and - fs/ntfs/aops.c::ntfs_commit_nonresident_write(), respectively. Also, - add generic_file_write() to the ntfs file operations (fs/ntfs/file.c). - This enables write(2) based overwriting of existing files on ntfs. - Note: As with mmap(2) based overwriting, resident files are only - written into memory, and not written out to disk at present, so avoid - writing to files smaller than about 1kiB. - - Implement ->truncate (fs/ntfs/inode.c::ntfs_truncate()) and - ->setattr() (fs/ntfs/inode.c::ntfs_setattr()) inode operations for - files with the purpose of intercepting and aborting all i_size - changes which we do not support yet. ntfs_truncate() actually only - emits a warning message but AFAICS our interception of i_size changes - elsewhere means ntfs_truncate() never gets called for i_size changes. - It is only called from generic_file_write() when we fail in - ntfs_prepare_{,nonresident_}write() in order to discard any - instantiated buffers beyond i_size. Thus i_size is not actually - changed so our warning message is enough. Unfortunately it is not - possible to easily determine if i_size is being changed or not hence - we just emit an appropriately worded error message. - -2.0.25 - Small bug fixes and cleanups. - - - Unlock the page in an out of memory error code path in - fs/ntfs/aops.c::ntfs_read_block(). - - If fs/ntfs/aops.c::ntfs_read_page() is called on an uptodate page, - just unlock the page and return. (This can happen due to ->writepage - clearing PageUptodate() during write out of MstProtected() - attributes.) - - Remove leaked write code again. - -2.0.24 - Cleanups. - - - Treat BUG_ON() as ASSERT() not VERIFY(), i.e. do not use side effects - inside BUG_ON(). (Adam J.
Richter) - - Split logical OR expressions inside BUG_ON() into individual BUG_ON() - calls for improved debugging. (Adam J. Richter) - - Add errors flag to the ntfs volume state, accessed via - NVol{,Set,Clear}Errors(vol). - - Do not allow read-write remounts of read-only volumes with errors. - - Clarify comment for ntfs file operation sendfile which was added by - Christoph Hellwig a while ago (just using generic_file_sendfile()) - to say that ntfs ->sendfile is only used for the case where the - source data is on the ntfs partition and the destination is - somewhere else, i.e. nothing we need to concern ourselves with. - - Add generic_file_write() as our ntfs file write operation. - -2.0.23 - Major bug fixes (races, deadlocks, non-i386 architectures). - - - Massive internal locking changes to mft record locking. Fixes lock - recursion and replaces the mrec_lock read/write semaphore with a - mutex. Also removes the now superfluous mft_count. This fixes several - race conditions and deadlocks, especially in the future write code. - - Fix ntfs over loopback for compressed files by adding an - optimization barrier. (gcc was screwing up otherwise?) - - Miscellaneous cleanups all over the code and a fix or two in error - handling code paths. - Thanks go to Christoph Hellwig for pointing out the following two: - - Remove now unused function fs/ntfs/malloc.h::vmalloc_nofs(). - - Fix ntfs_free() for ia64 and parisc by checking for VMALLOC_END, too. - -2.0.22 - Cleanups, mainly to ntfs_readdir(), and use C99 initializers. - - - Change fs/ntfs/dir.c::ntfs_readdir() to only read/write ->f_pos once - at entry/exit respectively. - - Use C99 initializers for structures. - - Remove unused variable blocks from fs/ntfs/aops.c::ntfs_read_block(). - -2.0.21 - Check for, and refuse to work with too large files/directories/volumes. - - - Limit volume size at mount time to 2TiB on architectures where - unsigned long is 32-bits (fs/ntfs/super.c::parse_ntfs_boot_sector()). - This is the most we can do without overflowing the 32-bit limit of - the block device size imposed on us by sb_bread() and sb_getblk() - for the time being. - - Limit file/directory size at open() time to 16TiB on architectures - where unsigned long is 32-bits (fs/ntfs/file.c::ntfs_file_open() and - fs/ntfs/dir.c::ntfs_dir_open()). This is the most we can do without - overflowing the page cache page index. - -2.0.20 - Support non-resident directory index bitmaps, fix page leak in readdir. - - - Move the directory index bitmap to use an attribute inode instead of - having special fields for it inside the ntfs inode structure. This - means that the index bitmaps now use the page cache for i/o, too, - and also as a side effect we get support for non-resident index - bitmaps for free. - - Simplify/cleanup error handling in fs/ntfs/dir.c::ntfs_readdir() and - fix a page leak that manifested itself in some cases. - - Add fs/ntfs/inode.c::ntfs_put_inode(), which we need to release the - index bitmap inode on the final iput(). - -2.0.19 - Fix race condition, improvements, and optimizations in i/o interface. - - - Apply block optimization added to fs/ntfs/aops.c::ntfs_read_block() - to fs/ntfs/compress.c::ntfs_file_read_compressed_block() as well. - - Drop the "file" from ntfs_file_read_compressed_block(). - - Rename fs/ntfs/aops.c::ntfs_end_buffer_read_async() to - ntfs_end_buffer_async_read() (more like the fs/buffer.c counterpart).
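The BUG_ON() rules in the 2.0.24 entry above are easiest to see side by side (illustrative conditions, not actual driver code):

    /* Before: one assertion; the oops cannot say which half fired, and
     * any side effect in the condition is lost if BUG_ON() compiles away. */
    BUG_ON(!ni || ni->mft_no != mft_no);

    /* After: each condition is its own assertion with its own line number. */
    BUG_ON(!ni);
    BUG_ON(ni->mft_no != mft_no);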
- - Update ntfs_end_buffer_async_read() with the improved logic from - its updated counterpart fs/buffer.c::end_buffer_async_read(). Apply - further logic improvements to better determine when we set PageError. - - Update submission of buffers in fs/ntfs/aops.c::ntfs_read_block() to - check for the buffers being uptodate first in line with the updated - fs/buffer.c::block_read_full_page(). This plugs a small race - condition. - -2.0.18 - Fix race condition in reading of compressed files. - - - There was a narrow window between checking a buffer head for being - uptodate and locking it in ntfs_file_read_compressed_block(). We now - lock the buffer and then check whether it is uptodate or not. - -2.0.17 - Cleanups and optimizations - shrinking the ToDo list. - - - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to return an error - code and update callers, i.e. ntfs_iget(), to pass that error code - up instead of just using -EIO. - - Modifications to super.c to ensure that both mount and remount - cannot set any write related options when the driver is compiled - read-only. - - Optimize block resolution in fs/ntfs/aops.c::ntfs_read_block() to - cache the current runlist element. This should improve performance - when reading very large and/or very fragmented data. - -2.0.16 - Convert access to $MFT/$BITMAP to attribute inode API. - - - Fix a stupid bug introduced in 2.0.15 where we were unmapping the - wrong inode in fs/ntfs/inode.c::ntfs_attr_iget(). - - Fix debugging check in fs/ntfs/aops.c::ntfs_read_block(). - - Convert $MFT/$BITMAP access to attribute inode API and remove all - remnants of the ugly mftbmp address space and operations hack. This - means we finally have only one readpage function as well as only one - async io completion handler. Yey! The mft bitmap is now just an - attribute inode and is accessed from vol->mftbmp_ino just as if it - were a normal file. Fake inodes rule. (-: - -2.0.15 - Fake inodes based attribute i/o via the pagecache, fixes and cleanups. - - - Fix silly bug in fs/ntfs/super.c::parse_options() which was causing - remounts to fail when the partition had an entry in /etc/fstab and - the entry specified the nls= option. - - Apply same macro magic used in fs/ntfs/inode.h to fs/ntfs/volume.h to - expand all the helper functions NVolFoo(), NVolSetFoo(), and - NVolClearFoo(). - - Move copyright statement from driver initialisation message to - module description (fs/super.c). This makes the initialisation - message fit on one line and fits in better with rest of kernel. - - Update fs/ntfs/attrib.c::map_run_list() to work on both real and - attribute inodes, and both for files and directories. - - Implement fake attribute inodes allowing all attribute i/o to go via - the page cache and to use all the normal vfs/mm functionality: - - Add ntfs_attr_iget() and its helper ntfs_read_locked_attr_inode() - to fs/ntfs/inode.c. - - Add needed cleanup code to ntfs_clear_big_inode(). - - Merge address space operations for files and directories (aops.c), - now just have ntfs_aops: - - Rename: - end_buffer_read_attr_async() -> ntfs_end_buffer_read_async(), - ntfs_attr_read_block() -> ntfs_read_block(), - ntfs_file_read_page() -> ntfs_readpage(). - - Rewrite fs/ntfs/aops.c::ntfs_readpage() to work on both real and - attribute inodes, and both for files and directories. - - Remove obsolete fs/ntfs/aops.c::ntfs_mst_readpage(). - -2.0.14 - Run list merging code cleanup, minor locking changes, typo fixes. 
- - - Change fs/ntfs/super.c::ntfs_statfs() to not rely on BKL by moving - the locking out of super.c::get_nr_free_mft_records() and taking and - dropping the mftbmp_lock rw_semaphore in ntfs_statfs() itself. - - Bring attribute runlist merging code (fs/ntfs/attrib.c) in sync with - current userspace ntfs library code. This means that if a merge - fails the original runlists are always left unmodified instead of - being silently corrupted. - - Misc typo fixes. - -2.0.13 - Use iget5_locked() in preparation for fake inodes and small cleanups. - - - Remove nr_mft_bits and the now superfluous union with nr_mft_records - from ntfs_volume structure. - - Remove nr_lcn_bits and the now superfluous union with nr_clusters - from ntfs_volume structure. - - Use iget5_locked() and friends instead of conventional iget(). Wrap - the call in fs/ntfs/inode.c::ntfs_iget() and update callers of iget() - to use ntfs_iget(). Leave only one iget() call at mount time so we - don't need an ntfs_iget_mount(). - - Change fs/ntfs/inode.c::ntfs_new_extent_inode() to take mft_no as an - additional argument. - -2.0.12 - Initial cleanup of address space operations following 2.0.11 changes. - - - Merge fs/ntfs/aops.c::end_buffer_read_mst_async() and - fs/ntfs/aops.c::end_buffer_read_file_async() into one function - fs/ntfs/aops.c::end_buffer_read_attr_async() using NInoMstProtected() - to determine whether to apply mst fixups or not. - - Above change allows merging fs/ntfs/aops.c::ntfs_file_read_block() - and fs/ntfs/aops.c::ntfs_mst_readpage() into one function - fs/ntfs/aops.c::ntfs_attr_read_block(). Also, create a tiny wrapper - fs/ntfs/aops.c::ntfs_mst_readpage() to transform the parameters from - the VFS readpage function prototype to the ntfs_attr_read_block() - function prototype. - -2.0.11 - Initial preparations for fake inode based attribute i/o. - - - Move definition of ntfs_inode_state_bits to fs/ntfs/inode.h and - do some macro magic (adapted from include/linux/buffer_head.h) to - expand all the helper functions NInoFoo(), NInoSetFoo(), and - NInoClearFoo(). - - Add new flag to ntfs_inode_state_bits: NI_Sparse. - - Add new fields to ntfs_inode structure to allow use of fake inodes - for attribute i/o: type, name, name_len. Also add new state bits: - NI_Attr, which, if set, indicates the inode is a fake inode, and - NI_MstProtected, which, if set, indicates the attribute uses multi - sector transfer protection, i.e. fixups need to be applied after - reads and before/after writes. - - Rename fs/ntfs/inode.c::ntfs_{new,clear,destroy}_inode() to - ntfs_{new,clear,destroy}_extent_inode() and update callers. - - Use ntfs_clear_extent_inode() in fs/ntfs/inode.c::__ntfs_clear_inode() - instead of ntfs_destroy_extent_inode(). - - Cleanup memory deallocations in {__,}ntfs_clear_{,big_}inode(). - - Make all operations on ntfs inode state bits use the NIno* functions. - - Set up the new ntfs inode fields and state bits in - fs/ntfs/inode.c::ntfs_read_inode() and add appropriate cleanup of - allocated memory to __ntfs_clear_inode(). - - Cleanup ntfs_inode structure a bit for better ordering of elements - w.r.t. their size to allow better packing of the structure in memory. - -2.0.10 - There can only be 2^32 - 1 inodes on an NTFS volume. - - - Add check at mount time to verify that the number of inodes on the - volume does not exceed 2^32 - 1, which is the maximum allowed for - NTFS according to Microsoft. - - Change mft_no member of ntfs_inode structure to be unsigned long. - Update all users. 
This makes ntfs_inode->mft_no just a copy of struct - inode->i_ino. But we can't just always use struct inode->i_ino and - remove mft_no because extent inodes do not have an attached struct - inode. - -2.0.9 - Decompression engine now uses a single buffer and other cleanups. - - - Change decompression engine to use a single buffer protected by a - spin lock instead of per-CPU buffers. (Rusty Russell) - - Do not update cb_pos when handling a partial final page during - decompression of a sparse compression block, as the value is later - reset without being read/used. (Rusty Russell) - - Switch to using the new KM_BIO_SRC_IRQ for atomic kmap()s. (Andrew - Morton) - - Change buffer size in ntfs_readdir()/ntfs_filldir() to use - NLS_MAX_CHARSET_SIZE which makes the buffers almost 1kiB each but - it also makes everything safer so it is a good thing. - - Miscellaneous minor cleanups to comments. - -2.0.8 - Major updates for handling of case sensitivity and dcache aliasing. - - Big thanks go to Al Viro and other inhabitants of #kernel for investing - their time to discuss the case sensitivity and dcache aliasing issues. - - - Remove unused source file fs/ntfs/attraops.c. - - Remove show_inodes mount option(s), thus dropping support for - displaying of short file names. - - Remove deprecated mount option posix. - - Restore show_sys_files mount option. - - Add new mount option case_sensitive, to determine if the driver - treats file names as case sensitive or not. If case sensitive, create - file names in the POSIX namespace. Otherwise create file names in the - LONG/WIN32 namespace. Note, files remain accessible via their short - file name, if it exists. - - Remove really dumb logic bug in boot sector recovery code. - - Fix dcache aliasing issues wrt short/long file names via changes - to fs/ntfs/dir.c::ntfs_lookup_inode_by_name() and - fs/ntfs/namei.c::ntfs_lookup(): - - Add additional argument to ntfs_lookup_inode_by_name() in which we - return information about the matching file name if the case is not - matching or the match is a short file name. See comments above the - function definition for details. - - Change ntfs_lookup() to only create dcache entries for the correctly - cased file name and only for the WIN32 namespace counterpart of DOS - namespace file names. This ensures we have only one dentry per - directory and also removes all dcache aliasing issues between short - and long file names once we add write support. See comments above - function for details. - - Fix potential 1 byte overflow in fs/ntfs/unistr.c::ntfs_ucstonls(). - -2.0.7 - Minor cleanups and updates for changes in core kernel code. - - - Remove much of the NULL struct element initializers. - - Various updates to make compatible with recent kernels. - - Remove defines of MAX_BUF_PER_PAGE and include linux/buffer_head.h - in fs/ntfs/ntfs.h instead. - - Remove no longer needed KERNEL_VERSION checks. We are now in the - kernel proper so they are no longer needed. - -2.0.6 - Major bugfix to make compatible with other kernel changes. - - - Initialize the mftbmp address space properly now that there are more - fields in the struct address_space. This was leading to hangs and - oopses on umount since 2.5.12 because of changes to other parts of - the kernel. We probably want a kernel generic init_address_space() - function... - - Drop BKL from ntfs_readdir() after consultation with Al Viro. 
The - only caller of ->readdir() is vfs_readdir() which holds i_mutex - during the call, and i_mutex is sufficient protection against changes - in the directory inode (including ->i_size). - - Use generic_file_llseek() for directories (as opposed to - default_llseek()) as this downs i_mutex instead of the BKL which is - what we now need for exclusion against ->f_pos changes considering we - no longer take the BKL in ntfs_readdir(). - -2.0.5 - Major bugfix. Buffer overflow in extent inode handling. - - - No need to set old blocksize in super.c::ntfs_fill_super() as the - VFS does so via invocation of deactivate_super() calling - fs->fill_super() calling block_kill_super() which does it. - - BKL moved from VFS into dir.c::ntfs_readdir(). (Linus Torvalds) - -> Do we really need it? I don't think so as we have exclusion on - the directory ntfs_inode rw_semaphore mrec_lock. We might have to - move the ->f_pos accesses under the mrec_lock though. Check this... - - Fix really, really, really stupid buffer overflow in extent inode - handling in mft.c::map_extent_mft_record(). - -2.0.4 - Cleanups and updates for kernel 2.5.11. - - - Add documentation on how to use the MD driver to be able to use NTFS - stripe and volume sets in Linux and generally cleanup documentation - a bit. - Remove all uses of kdev_t in favour of struct block_device *: - - Change compress.c::ntfs_file_read_compressed_block() to use - sb_getblk() instead of getblk(). - - Change super.c::ntfs_fill_super() to use bdev_hardsect_size() instead - of get_hardsect_size(). - - No need to get old blocksize in super.c::ntfs_fill_super() as - fs/super.c::get_sb_bdev() already does this. - - Set bh->b_bdev instead of bh->b_dev throughout aops.c. - -2.0.3 - Small bug fixes, cleanups, and performance improvements. - - - Remove some dead code from mft.c. - - Optimize readpage and read_block functions throughout aops.c so that - only initialized blocks are read. Non-initialized ones have their - buffer head mapped, zeroed, and set up to date, without scheduling - any i/o. Thanks to Al Viro for advice on how to avoid the device i/o. - Thanks go to Andrew Morton for spotting the below: - - Fix buglet in allocate_compression_buffers() error code path. - - Call flush_dcache_page() after modifying page cache page contents in - ntfs_file_readpage(). - - Check for existence of page buffers throughout aops.c before calling - create_empty_buffers(). This happens when an I/O error occurs and the - read is retried. (It also happens once writing is implemented so that - needed doing anyway but I had left it for later...) - - Don't BUG_ON() uptodate and/or mapped buffers throughout aops.c in - readpage and read_block functions. Reasoning same as above (i.e. I/O - error retries and future write code paths.) - -2.0.2 - Minor updates and cleanups. - - - Cleanup: rename mst.c::__post_read_mst_fixup to post_write_mst_fixup - and cleanup the code a bit, removing the unused size parameter. - - Change default fmask to 0177 and update documentation. - - Change attrib.c::get_attr_search_ctx() to return the search context - directly instead of taking the address of a pointer. A return value - of NULL means the allocation failed. Updated all callers - appropriately. - - Update to 2.5.9 kernel (preserving backwards compatibility) by - replacing all occurrences of page->buffers with page_buffers(page). - - Fix minor bugs in runlist merging, also minor cleanup. - - Updates to bootsector layout and mft mirror contents descriptions.
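The get_attr_search_ctx() convention from the 2.0.2 entry, combined with the 0/-ENOENT/-EIO returns adopted later (2.1.18 above), gives the canonical caller shape; a sketch using the later ntfs_attr_* names, with sketch_find_data_attr() purely illustrative:

    static int sketch_find_data_attr(ntfs_inode *ni, MFT_RECORD *mrec)
    {
        ntfs_attr_search_ctx *ctx;
        int err;

        /* The context is returned directly; NULL means allocation failed. */
        ctx = ntfs_attr_get_search_ctx(ni, mrec);
        if (!ctx)
            return -ENOMEM;
        /* 0 = found (ctx->attr is valid), -ENOENT = not found but ctx
         * describes the insertion point, -EIO = real error. */
        err = ntfs_attr_lookup(AT_DATA, NULL, 0, CASE_SENSITIVE, 0,
                NULL, 0, ctx);
        ntfs_attr_put_search_ctx(ctx);
        return err;
    }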
- - Small bug fix in error detection in unistr.c and some cleanups. - - Grow name buffer allocations in unistr.c in aligned multiples of 64 - bytes. - -2.0.1 - Minor updates. - - - Make default umask correspond to documentation. - - Improve documentation. - - Set default mode to include execute bit. The {u,f,d}mask can be used - to take it away if desired. This allows binaries to be executed from - a mounted ntfs partition. - -2.0.0 - New version number. Remove TNG from the name. Now in the kernel. - - - Add kill_super, just keeping up with the vfs changes in the kernel. - - Repeat some changes from tng-0.0.8 that somehow got lost on the way - from the CVS import into BitKeeper. - - Begin to implement proper handling of allocated_size vs - initialized_size vs data_size (i.e. i_size). Done are - mft.c::ntfs_mft_readpage(), aops.c::end_buffer_read_index_async(), - and attrib.c::load_attribute_list(). - - Lock the runlist in attrib.c::load_attribute_list() while using it. - - Fix memory leak in ntfs_file_read_compressed_block() and generally - clean up compress.c a little, removing some uncommented/unused debug - code. - - Tidy up dir.c a little bit. - - Don't bother getting the runlist in inode.c::ntfs_read_inode(). - - Merge mft.c::ntfs_mft_readpage() and aops.c::ntfs_index_readpage() - creating aops.c::ntfs_mst_readpage(), improving the handling of - holes and overflow in the process and implementing the correct - equivalent of ntfs_file_get_block() in ntfs_mst_readpage() itself. - I am aiming for correctness at the moment. Modularisation can come - later. - - Rename aops.c::end_buffer_read_index_async() to - end_buffer_read_mst_async() and optimize the overflow checking and - handling. - - Use the host of the mftbmp address space mapping to hold the ntfs - volume. This is needed so the async i/o completion handler can - retrieve a pointer to the volume. Hopefully this will not cause - problems elsewhere in the kernel... Otherwise will need to use a - fake inode. - - Complete implementation of proper handling of allocated_size vs - initialized_size vs data_size (i.e. i_size) in whole driver. - Basically aops.c is now completely rewritten. - - Change NTFS driver name to just NTFS and set version number to 2.0.0 - to make a clear distinction from the old driver which is still on - version 1.1.22. - -tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/ - - - Replace bdevname(sb->s_dev) with sb->s_id. - - Remove now superfluous new-line characters in all callers of - ntfs_debug(). - - Apply kludge in ntfs_read_inode(), setting i_nlink to 1 for - directories. Without this the "find" utility gets very upset which is - fair enough as Linux/Unix do not support directory hard links. - - Further runlist merging work. (Richard Russon) - - Backwards compatibility for gcc-2.95. (Richard Russon) - - Update to kernel 2.5.5-pre1 and rediff the now tiny patch. - - Convert to new filesystem declaration using ->ntfs_get_sb() and - replacing ntfs_read_super() with ntfs_fill_super(). - - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index - overflow on 32-bit architectures. - - Cleanup upcase loading code to use ntfs_(un)map_page(). - - Disable/reenable preemption in critical sections of compression engine. - - Replace device size determination in ntfs_fill_super() with - sb->s_bdev->bd_inode->i_size (in bytes) and remove now superfluous - function super.c::get_nr_blocks().
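The tng-0.0.8 switch to the get_sb()/fill_super() style of filesystem declaration had roughly this shape (a sketch only; the 2.5.x prototypes changed several times, so the exact signatures here are approximate):

    static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
            int flags, const char *dev_name, void *data)
    {
        /* get_sb_bdev() opens the device, allocates the super block and
         * calls back into ntfs_fill_super() to parse the boot sector. */
        return get_sb_bdev(fs_type, flags, dev_name, data,
                ntfs_fill_super);
    }

    static struct file_system_type ntfs_fs_type = {
        .owner    = THIS_MODULE,
        .name     = "ntfs",
        .get_sb   = ntfs_get_sb,
        .kill_sb  = kill_block_super,
        .fs_flags = FS_REQUIRES_DEV,
    };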
- - Implement a mount time option (show_inodes) allowing choice of which - types of inode names readdir() returns and modify ntfs_filldir() - accordingly. There are several parameters to show_inodes: - system: system files - win32: long file names (including POSIX file names) [DEFAULT] - long: same as win32 - dos: short file names only (excluding POSIX file names) - short: same as dos - posix: same as both win32 and dos - all: all file names - Note that the options are additive, i.e. specifying: - -o show_inodes=system,show_inodes=win32,show_inodes=dos - is the same as specifying: - -o show_inodes=all - Note that the "posix" and "all" options will show all directory - names, BUT the link count on each directory inode entry is set to 1, - due to Linux not supporting directory hard links. This may well - confuse some userspace applications, since the directory names will - have the same inode numbers. Thus it is NOT advisable to use the - "posix" or "all" options. We provide them only for completeness' sake. - - Add copies of allocated_size, initialized_size, and compressed_size to - the ntfs inode structure and set them up in - inode.c::ntfs_read_inode(). These reflect the unnamed data attribute - for files and the index allocation attribute for directories. - - Add copies of allocated_size and initialized_size to ntfs inode for - $BITMAP attribute of large directories and set them up in - inode.c::ntfs_read_inode(). - - Add copies of allocated_size and initialized_size to ntfs volume for - $BITMAP attribute of $MFT and set them up in - super.c::load_system_files(). - - Parse deprecated ntfs driver options (iocharset, show_sys_files, - posix, and utf8) and tell user what the new options to use are. Note - we still do support them but they will be removed with kernel 2.7.x. - - Change all occurrences of integer long long printf formatting to hex - as printk() will not support long long integer format if/when the - div64 patch goes into the kernel. - - Make slab caches have stable names and change the names to what they - were intended to be. These changes are required/made possible by the - new slab cache name handling which removes the length limitation by - requiring the caller of kmem_cache_create() to supply a stable name - which is then referenced but not copied. - - Rename run_list structure to run_list_element and create a new - run_list structure containing a pointer to a run_list_element - structure and a read/write semaphore. Adapt all users of runlists - to new scheme and take and release the lock as needed. This fixes a - nasty race as the run_list changes even when inodes are locked for - reading and even when the inode isn't locked at all, so we really - needed the serialization. We use a semaphore rather than a spinlock - as memory allocations can sleep and doing everything GFP_ATOMIC - would be silly. - - Cleanup read_inode() removing all code checking for lowest_vcn != 0. - This can never happen due to the nature of lookup_attr() and how we - support attribute lists. If it did happen it would imply the inode - being corrupt. - - Check for lowest_vcn != 0 in ntfs_read_inode() and mark the inode as - bad if found. - - Update to 2.5.6-pre2 changes in struct address_space. - - Use parent_ino() when accessing d_parent inode number in dir.c. - - Import Sourceforge CVS repository into BitKeeper repository: - http://linux-ntfs.bkbits.net/ntfs-tng-2.5 - - Update fs/Makefile, fs/Config.help, fs/Config.in, and - Documentation/filesystems/ntfs.txt for NTFS TNG.
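The run_list rework described above wraps the bare element array in a lockable container; a sketch with the ChangeLog's names (the later fs/ntfs/runlist.h spells them runlist_element/runlist):

    typedef struct {        /* One run of clusters. */
        VCN vcn;            /* Starting virtual cluster number. */
        LCN lcn;            /* Starting logical cluster number, or a
                             * negative LCN_* value if not mapped. */
        s64 length;         /* Run length in clusters. */
    } run_list_element;

    typedef struct {
        run_list_element *rl;       /* Protected by lock, as is *rl: merging
                                     * reallocates and replaces the array. */
        struct rw_semaphore lock;
    } run_list;

    /* Readers take down_read(); any path that can map or merge fragments
     * takes down_write(). A semaphore, not a spinlock, because mapping
     * allocates memory and may sleep. */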
- - Create kernel configuration option controlling whether debugging - is enabled or not. - - Add the required export of end_buffer_io_sync() from the patches - directory to the kernel code. - - Update inode.c::ntfs_show_options() with show_inodes mount option. - - Update errors mount option. - -tng-0.0.7 - 13/02/2002 - The driver is now feature complete for read-only! - - - Cleanup mft.c and its debug/error output in particular. Fix a minor - bug in mapping of extent inodes. Update all the comments to fit all - the recent code changes. - - Modify vcn_to_lcn() to cope with entirely unmapped runlists. - - Cleanups in compress.c, mostly comments and folding help. - - Implement attrib.c::map_run_list() as a generic helper. - - Make compress.c::ntfs_file_read_compressed_block() use map_run_list() - thus making code shorter and enabling attribute list support. - - Cleanup incorrect use of [su]64 with %L printf format specifier in - all source files. Type casts to [unsigned] long long added to correct - the mismatches (important for architectures which have long long not - being 64 bits). - - Merge async io completion handlers for directory indexes and $MFT - data into one by setting the index_block_size{_bits} of the ntfs - inode for $MFT to the mft_record_size{_bits} of the ntfs_volume. - - Cleanup aops.c, update comments. - - Make ntfs_file_get_block() use map_run_list() so all files now - support attribute lists. - - Make ntfs_dir_readpage() an almost verbatim copy of - block_read_full_page() by using ntfs_file_get_block() with only real - difference being the use of our own async io completion handler - rather than the default one, thus reducing the amount of code and - automatically enabling attribute list support for directory indices. - - Fix bug in load_attribute_list() - forgot to call brelse in error - code path. - - Change parameters to find_attr() and lookup_attr(). We no longer - pass in the upcase table and its length. These can be gotten from - ctx->ntfs_ino->vol->upcase{_len}. Update all callers. - - Cleanups in attrib.c. - - Implement merging of runlists, attrib.c::merge_run_lists() and its - helpers. (Richard Russon) - - Attribute lists part 2, attribute extents and multi part runlists: - enable proper support for LCN_RL_NOT_MAPPED and automatic mapping of - further runlist parts via attrib.c::map_run_list(). - - Tiny endianness bug fix in decompress_mapping_pairs(). - -tng-0.0.6 - Encrypted directories, bug fixes, cleanups, debugging enhancements. - - - Enable encrypted directories. (Their index root is marked encrypted - to indicate that new files in that directory should be created - encrypted.) - - Fix bug in NInoBmpNonResident() macro. (Cut and paste error.) - - Enable $Extend system directory. Most (if not all) extended system - files do not have unnamed data attributes so ntfs_read_inode() had to - special case them but that is ok, as the special casing recovery - happens inside an error code path so there is zero slow down in the - normal fast path. The special casing is done by introducing a new - function inode.c::ntfs_is_extended_system_file() which checks if any - of the hard links in the inode point to $Extend as being their parent - directory and if they do we assume this is an extended system file. - - Create a sysctl/proc interface to allow {dis,en}abling of debug output - when compiled with -DDEBUG. Default is debug messages to be disabled.
To enable them, one writes a non-zero value to /proc/sys/fs/ntfs-debug - (if /proc is enabled) or uses sysctl(2) to effect the same (if sysctl - interface is enabled). Inspired by old ntfs driver. - - Add debug_msgs insmod/kernel boot parameter to set whether debug - messages are {dis,en}abled. This is useful to enable debug messages - during ntfs initialization and is the only way to activate debugging - when the sysctl interface is not enabled. - - Cleanup debug output in various places. - - Remove all dollar signs ($) from the source (except comments) to - enable compilation on architectures whose gcc compiler does not - support dollar signs in the names of variables/constants. Attribute - types now start with AT_ instead of $ and $I30 is now just I30. - - Cleanup ntfs_lookup() and add consistency check of sequence numbers. - - Load complete runlist for $MFT/$BITMAP during mount and cleanup - access functions. This means we now cope with $MFT/$BITMAP being - spread across several mft records. - - Disable modification of mft_zone_multiplier on remount. We can always - reenable this later on if we really want to, but we will need to make - sure we readjust the mft_zone size / layout accordingly. - -tng-0.0.5 - Modernize for 2.5.x and further in line-ing with Al Viro's comments. - - - Use sb_set_blocksize() instead of set_blocksize() and verify the - return value. - - Use sb_bread() instead of bread() throughout. - - Add index_vcn_size{_bits} to ntfs_inode structure to store the size - of a directory index block vcn. Apply resulting simplifications in - dir.c everywhere. - - Fix a small bug somewhere (but forgot what it was). - - Change ntfs_{debug,error,warning} to enable gcc to do type checking - on the printf-format parameter list and fix bugs reported by gcc - as a result. (Richard Russon) - - Move inode allocation strategy to Al's new stuff but maintain the - divorce of ntfs_inode from struct inode. To achieve this we have two - separate slab caches, one for big ntfs inodes containing a struct - inode and pure ntfs inodes and at the same time fix some faulty - error code paths in ntfs_read_inode(). - - Show mount options in proc (inode.c::ntfs_show_options()). - -tng-0.0.4 - Big changes, getting in line with Al Viro's comments. - - - Modified (un)map_mft_record functions to be common for read and write - case. To specify which is which, added extra parameter at front of - parameter list. Pass either READ or WRITE to this, each has the - obvious meaning. - - General cleanups to allow for easier folding in vi. - - attrib.c::decompress_mapping_pairs() now accepts the old runlist - argument, and invokes attrib.c::merge_run_lists() to merge the old - and the new runlists. - - Removed attrib.c::find_first_attr(). - - Implemented loading of attribute list and complete runlist for $MFT. - This means we now cope with $MFT being spread across several mft - records. - - Adapt to 2.5.2-pre9 and the changed create_empty_buffers() syntax. - - Adapt major/minor/kdev_t/[bk]devname stuff to new 2.5.x kernels. - - Make ntfs_volume be allocated via kmalloc() instead of using a slab - cache. There are too few ntfs_volume structures at any one time - to justify a private slab cache. - - Fix bogus kmap() use in async io completion. Now use kmap_atomic(). - Use KM_BIO_IRQ on advice from IRC/kernel... - - Use ntfs_map_page() in map_mft_record() and create ->readpage method - for reading $MFT (ntfs_mft_readpage).
In the process create dedicated - address space operations (ntfs_mft_aops) for $MFT inode mapping. Also - removed the now superfluous exports from the kernel core patch. - - Fix a bug where kfree() was used instead of ntfs_free(). - - Change map_mft_record() to take ntfs_inode as argument instead of - vfs inode. Ditto for unmap_mft_record(). Adapt all callers. - - Add pointer to ntfs_volume to ntfs_inode. - - Add mft record number and sequence number to ntfs_inode. Stop using - i_ino and i_generation for in-driver purposes. - - Implement attrib.c::merge_run_lists(). (Richard Russon) - - Remove use of proper inodes by extent inodes. Move i_ino and - i_generation to ntfs_inode to do this. Apply simplifications that - result and remove iget_no_wait(), etc. - - Pass ntfs_inode everywhere in the driver (used to be struct inode). - - Add reference counting in ntfs_inode for the ntfs inode itself and - for the mapped mft record. - - Extend mft record mapping so we can (un)map extent mft records (new - functions (un)map_extent_mft_record), and so mappings are reference - counted and don't have to happen twice if already mapped - just ref - count increases. - - Add -o iocharset as alias to -o nls for backwards compatibility. - - The latest core patch is now tiny. In fact just a single additional - export is necessary over the base kernel. - -tng-0.0.3 - Cleanups, enhancements, bug fixes. - - - Work on attrib.c::decompress_mapping_pairs() to detect base extents - and setup the runlist appropriately using knowledge provided by the - sizes in the base attribute record. - - Balance the get_/put_attr_search_ctx() calls so we don't leak memory - any more. - - Introduce ntfs_malloc_nofs() and ntfs_free() to allocate/free a single - page or use vmalloc depending on the amount of memory requested. - - Cleanup error output. The __FUNCTION__ "(): " is now added - automatically. Introduced a new header file debug.h to support this - and also moved ntfs_debug() function into it. - - Make reading of compressed files more intelligent and especially get - rid of the vmalloc_nofs() from readpage(). This now uses per CPU - buffers (allocated at first mount with cluster size <= 4kiB and - deallocated on last umount with cluster size <= 4kiB), and - asynchronous io for the compressed data using a list of buffer heads. - Er, we use synchronous io as async io only works on whole pages - covered by buffers and not on individual buffer heads... - - Bug fix for reading compressed files with sparse compression blocks. - -tng-0.0.2 - Now handles larger/fragmented/compressed volumes/files/dirs. - - - Fixed handling of directories when cluster size exceeds index block - size. - - Hide DOS only name space directory entries from readdir() but allow - them in lookup(). This should fix the problem that Linux doesn't - support directory hard links, while still allowing access to entries - via their short file name. This also has the benefit of mimicking - what Windows users are used to, so it is the ideal solution. - - Implemented sync_page everywhere so no more hangs in D state when - waiting for a page. - - Stop using bforget() in favour of brelse(). - - Stop locking buffers unnecessarily. - - Implemented compressed files (inode->mapping contains uncompressed - data, raw compressed data is currently bread() into a vmalloc()ed - memory buffer). - - Enable compressed directories. (Their index root is marked compressed - to indicate that new files in that directory should be created - compressed.)
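The tng-0.0.3 allocator pair above picks between a single page and vmalloc; a sketch of the early form (with the VMALLOC_END check that the 2.0.23 entry later added for ia64/parisc):

    static inline void *ntfs_malloc_nofs(unsigned long size)
    {
        if (likely(size <= PAGE_SIZE)) {
            BUG_ON(!size);
            /* One page covers it; kmalloc has cheap per-CPU caches. */
            return kmalloc(PAGE_SIZE, GFP_NOFS);
        }
        if (likely(size >> PAGE_SHIFT < num_physpages))
            return __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
        return NULL;
    }

    static inline void ntfs_free(void *addr)
    {
        if (likely((unsigned long)addr < VMALLOC_START ||
                (unsigned long)addr >= VMALLOC_END)) {
            kfree(addr);
            return;
        }
        vfree(addr);
    }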
- - Use vsnprintf rather than vsprintf in the ntfs_error and ntfs_warning - functions. (Thanks to Will Dyson for pointing this out.) - - Moved the ntfs_inode and ntfs_volume (the former ntfs_inode_info and - ntfs_sb_info) out of the common inode and super_block structures and - started using the generic_ip and generic_sbp pointers instead. This - makes ntfs entirely private with respect to the kernel tree. - - Detect compiler version and abort with error message if gcc less than - 2.96 is used. - - Fix bug in name comparison function in unistr.c. - - Implement attribute lists part 1, the infrastructure: search contexts - and operations, find_external_attr(), lookup_attr() and make the - code use the infrastructure. - - Fix stupid buffer overflow bug that became apparent on larger run - list containing attributes. - - Fix bugs in readdir() that became apparent on larger directories. - - The driver is now really useful and survives the test - find . -type f -exec md5sum "{}" \; - without any error messages on an over 1GiB sized partition with >16k - files on it, including compressed files and directories and many files - and directories with attribute lists. - -tng-0.0.1 - The first useful version. - - - Added ntfs_lookup(). - - Added default upcase generation and handling. - - Added compile options to be shown on module init. - - Many bug fixes that were "hidden" before. - - Update to latest kernel. - - Added ntfs_readdir(). - - Added file operations for mmap(), read(), open() and llseek(). We just - use the generic ones. The whole point of going through implementing - readpage() methods and where possible get_block() call backs is that - this allows us to make use of the generic high level methods provided - by the kernel. - - The driver is now actually useful! Yey. (-: It undoubtedly has got bugs - though and it doesn't implement accessing compressed files yet. Also, - accessing files with attribute list attributes is not implemented yet - either. But for small or simple filesystems it should work and allow - you to list directories, use stat on directory entries and the file - system, open, read, mmap and llseek around in files. A big milestone - has been reached! - -tng-0.0.0 - Initial version tag. - - Initial driver implementation. The driver can mount and umount simple - NTFS filesystems (i.e. ones without attribute lists in the system - files). If the mount fails there might be problems in the error handling - code paths, so be warned. Otherwise it seems to be loading the system - files nicely and the mft record read mapping/unmapping seems to be - working nicely, too. Proof of inode metadata in the page cache and non- - resident file unnamed stream data in the page cache concepts is thus - complete.
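The tng-0.0.2 divorce of ntfs_inode/ntfs_volume from the VFS structures hinged on the generic private pointers that struct inode and struct super_block carried at the time; a sketch (modern kernels embed the VFS inode in the filesystem inode instead):

    /* All ntfs state hangs off the VFS objects via opaque pointers. */
    static inline ntfs_inode *NTFS_I(struct inode *inode)
    {
        return (ntfs_inode *)inode->u.generic_ip;
    }

    static inline ntfs_volume *NTFS_SB(struct super_block *sb)
    {
        return (ntfs_volume *)sb->u.generic_sbp;
    }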
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 5a9e34475e3..9173e82a45d 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, write_inode_now(bmp_vi, !datasync); iput(bmp_vi); } - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); err = sync_blockdev(vi->i_sb->s_bdev); if (unlikely(err && !ret)) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 43179ddd336..b681c71d706 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) - ret = ntfs_write_inode(vi, 1); + ret = __ntfs_write_inode(vi, 1); write_inode_now(vi, !datasync); /* * NOTE: If we were to use mapping->private_list (see ext2 and diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index dc2505abb6d..4b57fb1eac2 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2957,7 +2957,7 @@ out: * * Return 0 on success and -errno on error. */ -int ntfs_write_inode(struct inode *vi, int sync) +int __ntfs_write_inode(struct inode *vi, int sync) { sle64 nt; ntfs_inode *ni = NTFS_I(vi); diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 117eaf8032a..9a113544605 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi); extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); -extern int ntfs_write_inode(struct inode *vi, int sync); +extern int __ntfs_write_inode(struct inode *vi, int sync); static inline void ntfs_commit_inode(struct inode *vi) { if (!is_bad_inode(vi)) - ntfs_write_inode(vi, 1); + __ntfs_write_inode(vi, 1); return; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 80b04770e8e..0de1db6cddb 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -31,6 +31,7 @@ #include <linux/vfs.h> #include <linux/moduleparam.h> #include <linux/smp_lock.h> +#include <linux/bitmap.h> #include "sysctl.h" #include "logfile.h" @@ -39,6 +40,7 @@ #include "dir.h" #include "debug.h" #include "index.h" +#include "inode.h" #include "aops.h" #include "layout.h" #include "malloc.h" @@ -2457,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb) static s64 get_nr_free_clusters(ntfs_volume *vol) { s64 nr_free = vol->nr_clusters; - u32 *kaddr; struct address_space *mapping = vol->lcnbmp_ino->i_mapping; struct page *page; pgoff_t index, max_index; @@ -2476,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", max_index, PAGE_CACHE_SIZE / 4); for (index = 0; index < max_index; index++) { - unsigned int i; + unsigned long *kaddr; + /* * Read the page from page cache, getting it from backing store * if necessary, and increment the use count. @@ -2489,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) nr_free -= PAGE_CACHE_SIZE * 8; continue; } - kaddr = (u32*)kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page, KM_USER0); /* - * For each 4 bytes, subtract the number of set bits. If this + * Subtract the number of set bits. If this * is the last page and it is partial we don't really care as * it just means we do a little extra work but it won't affect * the result as all out of range bytes are set to zero by * ntfs_readpage(). 
*/ - for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) - nr_free -= (s64)hweight32(kaddr[i]); + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); kunmap_atomic(kaddr, KM_USER0); page_cache_release(page); } @@ -2537,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, s64 nr_free, const pgoff_t max_index) { - u32 *kaddr; struct address_space *mapping = vol->mftbmp_ino->i_mapping; struct page *page; pgoff_t index; @@ -2547,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); for (index = 0; index < max_index; index++) { - unsigned int i; + unsigned long *kaddr; + /* * Read the page from page cache, getting it from backing store * if necessary, and increment the use count. @@ -2560,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, nr_free -= PAGE_CACHE_SIZE * 8; continue; } - kaddr = (u32*)kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page, KM_USER0); /* - * For each 4 bytes, subtract the number of set bits. If this + * Subtract the number of set bits. If this * is the last page and it is partial we don't really care as * it just means we do a little extra work but it won't affect * the result as all out of range bytes are set to zero by * ntfs_readpage(). */ - for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) - nr_free -= (s64)hweight32(kaddr[i]); + nr_free -= bitmap_weight(kaddr, + PAGE_CACHE_SIZE * BITS_PER_BYTE); kunmap_atomic(kaddr, KM_USER0); page_cache_release(page); } @@ -2662,6 +2664,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) return 0; } +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + /** * The complete super operations. 
*/ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2bbe1ecc08c..9f8bd913c51 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5713,7 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(inode->i_sb, len)); ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); @@ -6936,7 +6936,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, goto bail; } - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - @@ -7301,11 +7301,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, unsigned int page_end; u64 phys; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, @@ -7381,7 +7380,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4c2a6d282c4..21441ddb550 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1764,10 +1764,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; - if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { - ret = -EDQUOT; - goto out_commit; + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; } /* * We don't want this to fail in ocfs2_write_end(), so do it @@ -1810,7 +1811,7 @@ success: return 0; out_quota: if (clusters_to_alloc) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index b39da877b12..3bb928a2bf7 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -136,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, return mlog_mask_store(mlog_attr->mask, buf, count); } -static struct sysfs_ops mlog_attr_ops = { +static const struct sysfs_ops mlog_attr_ops = { .show = mlog_show, .store = mlog_store, }; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d8d0c65ac03..73e743eea2c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -72,9 +72,9 @@ #include "tcp_internal.h" -#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" +#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u" #define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ - NIPQUAD(sc->sc_node->nd_ipv4_address), \ + &sc->sc_node->nd_ipv4_address, \ ntohs(sc->sc_node->nd_ipv4_port) /* diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 765d66c7098..efd77d071c8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2964,12 +2964,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, - alloc + dx_alloc))) { - ret = -EDQUOT; + ret = 
dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc)); + if (ret) goto out_commit; - } did_quota = 1; if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3178,7 +3176,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, bytes_allocated); + dquot_free_space_nodirty(dir, bytes_allocated); ocfs2_commit_trans(osb, handle); @@ -3221,11 +3219,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, @@ -3254,7 +3251,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, status = 0; bail: if (did_quota && status < 0) - vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); mlog_exit(status); return status; } @@ -3889,11 +3886,10 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(dir->i_sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(dir->i_sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, @@ -3983,7 +3979,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); @@ -4165,11 +4161,10 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, goto out; } - if (vfs_dq_alloc_space_nodirty(dir, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - ret = -EDQUOT; + ret = dquot_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) goto out_commit; - } did_quota = 1; /* @@ -4229,7 +4224,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, out_commit: if (ret < 0 && did_quota) - vfs_dq_free_space_nodirty(dir, + dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8298608d416..50c4ee805da 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1881,7 +1881,7 @@ out: * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of * flock() calls. The locking approach this requires is sufficiently * different from all other cluster lock types that we implement a - * seperate path to the "low-level" dlm calls. In particular: + * separate path to the "low-level" dlm calls. In particular: * * - No optimization of lock levels is done - we take at exactly * what's been requested. diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 5328529e7fd..c562a7581cf 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -453,7 +453,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (i == -1) { /* * Holes can be larger than the maximum size of an - * extent, so we return their lengths in a seperate + * extent, so we return their lengths in a separate * field. 
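 */

/*
 * Editor's note, not part of the patch: every ocfs2 hunk above performs the
 * same mechanical conversion. The old vfs_dq_*_nodirty() helpers returned a
 * boolean ("nonzero means over quota", mapped by hand to -EDQUOT); the new
 * dquot_*_nodirty() helpers return 0 or a -errno directly. Hypothetical
 * before/after shape:
 */
#include <linux/quotaops.h>

static int demo_reserve(struct inode *inode, qsize_t bytes)
{
        int status;

        /*
         * Old style:
         *      if (vfs_dq_alloc_space_nodirty(inode, bytes)) {
         *              status = -EDQUOT;
         *              goto out;
         *      }
         */
        status = dquot_alloc_space_nodirty(inode, bytes);
        if (status)
                goto out;
        /* ... allocate clusters, journal the change ... */
out:
        return status;
}

/* the extent-map hunk continues: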
*/ if (hole_len) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5b52547d629..17947dc8341 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -107,6 +107,9 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); + if (file->f_mode & FMODE_WRITE) + dquot_initialize(inode); + spin_lock(&oi->ip_lock); /* Check that the inode hasn't been wiped from disk by another @@ -629,11 +632,10 @@ restart_all: } restarted_transaction: - if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, - clusters_to_add))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + if (status) goto leave; - } did_quota = 1; /* reserve a write to the file entry early on - that we if we @@ -674,7 +676,7 @@ restarted_transaction: clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); /* Release unused quota reservation */ - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); did_quota = 0; @@ -710,7 +712,7 @@ restarted_transaction: leave: if (status < 0 && did_quota) - vfs_dq_free_space(inode, + dquot_free_space(inode, ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); @@ -978,6 +980,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + dquot_initialize(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1020,7 +1024,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) /* * Gather pointers to quota structures so that allocation / * freeing of quota structures happens here and not inside - * vfs_dq_transfer() where we have problems with lock ordering + * dquot_transfer() where we have problems with lock ordering */ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, @@ -1053,7 +1057,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock; } - status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + status = dquot_transfer(inode, attr); if (status < 0) goto bail_commit; } else { diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 88459bdd1ff..278a223aae1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -665,7 +665,7 @@ static int ocfs2_remove_inode(struct inode *inode, } ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); - vfs_dq_free_inode(inode); + dquot_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); @@ -971,6 +971,8 @@ void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -1087,6 +1089,8 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); + dquot_drop(inode); + /* To preven remote deletes we hold open lock before, now it * is time to unlock PR and EX open locks. 
*/ ocfs2_open_unlock(inode); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 50fb26a6a5f..d9cd4e373a5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -212,7 +212,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) } else inode->i_gid = current_fsgid(); inode->i_mode = mode; - vfs_dq_init(inode); + dquot_initialize(inode); return inode; } @@ -244,6 +244,8 @@ static int ocfs2_mknod(struct inode *dir, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + /* get our super block */ osb = OCFS2_SB(dir->i_sb); @@ -348,13 +350,9 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, @@ -431,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir, status = 0; leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -636,6 +634,8 @@ static int ocfs2_link(struct dentry *old_dentry, if (S_ISDIR(inode->i_mode)) return -EPERM; + dquot_initialize(dir); + err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); if (err < 0) { if (err != -ENOENT) @@ -791,6 +791,8 @@ static int ocfs2_unlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + BUG_ON(dentry->d_parent->d_inode != dir); mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1051,6 +1053,9 @@ static int ocfs2_rename(struct inode *old_dir, old_dentry->d_name.len, old_dentry->d_name.name, new_dentry->d_name.len, new_dentry->d_name.name); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + osb = OCFS2_SB(old_dir->i_sb); if (new_inode) { @@ -1599,6 +1604,8 @@ static int ocfs2_symlink(struct inode *dir, mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); + dquot_initialize(dir); + sb = dir->i_sb; osb = OCFS2_SB(sb); @@ -1688,13 +1695,9 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. 
*/ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto bail; - } did_quota_inode = 1; mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, @@ -1716,11 +1719,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - if (vfs_dq_alloc_space_nodirty(inode, - ocfs2_clusters_to_bytes(osb->sb, 1))) { - status = -EDQUOT; + status = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status) goto bail; - } did_quota = 1; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, @@ -1788,10 +1790,10 @@ static int ocfs2_symlink(struct inode *dir, d_instantiate(dentry, inode); bail: if (status < 0 && did_quota) - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -2099,13 +2101,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - /* We don't use standard VFS wrapper because we don't want vfs_dq_init - * to be called. */ - if (sb_any_quota_active(osb->sb) && - osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { - status = -EDQUOT; + status = dquot_alloc_inode(inode); + if (status) goto leave; - } did_quota_inode = 1; inode->i_nlink = 0; @@ -2140,7 +2138,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b437dc0c4ca..355f41d1d52 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -851,13 +851,6 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) } const struct dquot_operations ocfs2_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = ocfs2_write_dquot, .acquire_dquot = ocfs2_acquire_dquot, .release_dquot = ocfs2_release_dquot, diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 21f9e71223c..a6467f3d262 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -457,7 +457,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, break; } dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; - for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { qbh = NULL; status = ocfs2_read_quota_block(lqinode, ol_dqblk_block(sb, chunk, bit), diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index fb6aa7acf54..9e96921dffd 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4390,7 +4390,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, } mutex_lock(&inode->i_mutex); - vfs_dq_init(dir); + dquot_initialize(dir); error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index f3b7c1541f3..75d9b5ba1d4 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -11,6 +11,7 @@ #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> +#include 
<linux/writeback.h> #include <linux/crc-itu-t.h> #include "omfs.h" @@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi) oi->i_head.h_check_xor = xor; } -static int omfs_write_inode(struct inode *inode, int wait) +static int __omfs_write_inode(struct inode *inode, int wait) { struct omfs_inode *oi; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); @@ -162,9 +163,14 @@ out: return ret; } +static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int omfs_sync_inode(struct inode *inode) { - return omfs_write_inode(inode, 1); + return __omfs_write_inode(inode, 1); } /* diff --git a/fs/open.c b/fs/open.c index e0b2d88b038..e17f54454b5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/slab.h> @@ -278,10 +277,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) error = security_path_truncate(&path, length, 0); - if (!error) { - vfs_dq_init(inode); + if (!error) error = do_truncate(path.dentry, length, 0, NULL); - } put_write_and_out: put_write_access(inode); diff --git a/fs/proc/array.c b/fs/proc/array.c index 18e20feee25..aa8637b8102 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -273,7 +273,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) rcu_read_lock(); /* FIXME: is this correct? */ qsize = atomic_read(&__task_cred(p)->user->sigpending); rcu_read_unlock(); - qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; + qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); } @@ -420,7 +420,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9580abeadeb..08f4d71dacd 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -291,19 +291,17 @@ static const struct inode_operations proc_file_inode_operations = { * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. 
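 */

/*
 * Editor's aside, not part of the patch: the hunk below is the standard
 * "__helper plus locking wrapper" split, done so that remove_proc_entry()
 * can hold proc_subdir_lock across both the name lookup and the unlink
 * instead of taking it twice. Generic shape with hypothetical names:
 */
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

static int __demo_lookup(const char *name)
{
        /* caller must hold demo_lock; walk the data structure here */
        return 0;
}

static int demo_lookup(const char *name)
{
        int rv;

        spin_lock(&demo_lock);
        rv = __demo_lookup(name);
        spin_unlock(&demo_lock);
        return rv;
}

/* the /proc hunk: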
*/ -static int xlate_proc_name(const char *name, - struct proc_dir_entry **ret, const char **residual) +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) { const char *cp = name, *next; struct proc_dir_entry *de; int len; - int rtn = 0; de = *ret; if (!de) de = &proc_root; - spin_lock(&proc_subdir_lock); while (1) { next = strchr(cp, '/'); if (!next) @@ -315,16 +313,25 @@ static int xlate_proc_name(const char *name, break; } if (!de) { - rtn = -ENOENT; - goto out; + WARN(1, "name '%s'\n", name); + return -ENOENT; } cp += len + 1; } *residual = cp; *ret = de; -out: + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); spin_unlock(&proc_subdir_lock); - return rtn; + return rv; } static DEFINE_IDA(proc_inum_ida); @@ -797,11 +804,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) const char *fn = name; int len; - if (xlate_proc_name(name, &parent, &fn) != 0) + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); return; + } len = strlen(fn); - spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { if (proc_match(len, fn, *p)) { de = *p; @@ -811,8 +820,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) } } spin_unlock(&proc_subdir_lock); - if (!de) + if (!de) { + WARN(1, "name '%s'\n", name); return; + } spin_lock(&de->pde_unload_lock); /* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f277c4a111c..183f8ff5f40 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -16,7 +16,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib; + unsigned long data, text, lib, swap; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n", + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + swap << (PAGE_SHIFT-10)); } unsigned long task_vsize(struct mm_struct *mm) @@ -65,11 +68,11 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - *shared = get_mm_counter(mm, file_rss); + *shared = get_mm_counter(mm, MM_FILEPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = *shared + get_mm_counter(mm, anon_rss); + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index ebf3440d28c..277575ddc05 100644 --- 
a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -201,7 +201,8 @@ static const char *qnx4_checkroot(struct super_block *sb) rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); if (rootdir->di_fname != NULL) { QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); - if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) { + if (!strcmp(rootdir->di_fname, + QNX4_BMNAME)) { found = 1; qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); if (!qnx4_sb(sb)->BitMap) { diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index efc02ebb8c7..dad7fb247dd 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -59,3 +59,8 @@ config QUOTACTL bool depends on XFS_QUOTA || QUOTA default y + +config QUOTACTL_COMPAT + bool + depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT + default y diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 68d4f6dc057..5f9e9e276af 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o +obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o +obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o diff --git a/fs/quota/compat.c b/fs/quota/compat.c new file mode 100644 index 00000000000..fb1892fe3e5 --- /dev/null +++ b/fs/quota/compat.c @@ -0,0 +1,118 @@ + +#include <linux/syscalls.h> +#include <linux/compat.h> +#include <linux/quotaops.h> + +/* + * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) + * and is necessary due to alignment problems. + */ +struct compat_if_dqblk { + compat_u64 dqb_bhardlimit; + compat_u64 dqb_bsoftlimit; + compat_u64 dqb_curspace; + compat_u64 dqb_ihardlimit; + compat_u64 dqb_isoftlimit; + compat_u64 dqb_curinodes; + compat_u64 dqb_btime; + compat_u64 dqb_itime; + compat_uint_t dqb_valid; +}; + +/* XFS structures */ +struct compat_fs_qfilestat { + compat_u64 dqb_bhardlimit; + compat_u64 qfs_nblks; + compat_uint_t qfs_nextents; +}; + +struct compat_fs_quota_stat { + __s8 qs_version; + __u16 qs_flags; + __s8 qs_pad; + struct compat_fs_qfilestat qs_uquota; + struct compat_fs_qfilestat qs_gquota; + compat_uint_t qs_incoredqs; + compat_int_t qs_btimelimit; + compat_int_t qs_itimelimit; + compat_int_t qs_rtbtimelimit; + __u16 qs_bwarnlimit; + __u16 qs_iwarnlimit; +}; + +asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) +{ + unsigned int cmds; + struct if_dqblk __user *dqblk; + struct compat_if_dqblk __user *compat_dqblk; + struct fs_quota_stat __user *fsqstat; + struct compat_fs_quota_stat __user *compat_fsqstat; + compat_uint_t data; + u16 xdata; + long ret; + + cmds = cmd >> SUBCMDSHIFT; + + switch (cmds) { + case Q_GETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = sys_quotactl(cmd, special, id, dqblk); + if (ret) + break; + if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || + get_user(data, &dqblk->dqb_valid) || + put_user(data, &compat_dqblk->dqb_valid)) + ret = -EFAULT; + break; + case Q_SETQUOTA: + dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); + compat_dqblk = addr; + ret = -EFAULT; + if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || + get_user(data, &compat_dqblk->dqb_valid) || + put_user(data, &dqblk->dqb_valid)) + break; + ret = sys_quotactl(cmd, special, id, dqblk); + break; + case Q_XGETQSTAT: + fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); + 
compat_fsqstat = addr; + ret = sys_quotactl(cmd, special, id, fsqstat); + if (ret) + break; + ret = -EFAULT; + /* Copying qs_version, qs_flags, qs_pad */ + if (copy_in_user(compat_fsqstat, fsqstat, + offsetof(struct compat_fs_quota_stat, qs_uquota))) + break; + /* Copying qs_uquota */ + if (copy_in_user(&compat_fsqstat->qs_uquota, + &fsqstat->qs_uquota, + sizeof(compat_fsqstat->qs_uquota)) || + get_user(data, &fsqstat->qs_uquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) + break; + /* Copying qs_gquota */ + if (copy_in_user(&compat_fsqstat->qs_gquota, + &fsqstat->qs_gquota, + sizeof(compat_fsqstat->qs_gquota)) || + get_user(data, &fsqstat->qs_gquota.qfs_nextents) || + put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) + break; + /* Copying the rest */ + if (copy_in_user(&compat_fsqstat->qs_incoredqs, + &fsqstat->qs_incoredqs, + sizeof(struct compat_fs_quota_stat) - + offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || + get_user(xdata, &fsqstat->qs_iwarnlimit) || + put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) + break; + ret = 0; + break; + default: + ret = sys_quotactl(cmd, special, id, addr); + } + return ret; +} diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3fc62b097be..e0b870f4749 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -100,9 +100,13 @@ * * Any operation working on dquots via inode pointers must hold dqptr_sem. If * operation is just reading pointers from inode (or not using them at all) the - * read lock is enough. If pointers are altered function must hold write lock - * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_mutex is also needed). + * read lock is enough. If pointers are altered function must hold write lock. + * Special care needs to be taken about S_NOQUOTA inode flag (marking that + * inode is a quota file). Functions adding pointers from inode to dquots have + * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they + * have to do all pointer modifications before dropping dqptr_sem. This makes + * sure they cannot race with quotaon which first sets S_NOQUOTA flag and + * then drops all pointers to dquots from an inode. * * Each dquot has its dq_lock mutex. Locked dquots might not be referenced * from inodes (dquot_alloc_space() and such don't check the dq_lock). @@ -225,6 +229,9 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); +static qsize_t inode_get_rsv_space(struct inode *inode); +static void __dquot_initialize(struct inode *inode, int type); + static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) { @@ -564,7 +571,7 @@ out: } EXPORT_SYMBOL(dquot_scan_active); -int vfs_quota_sync(struct super_block *sb, int type) +int vfs_quota_sync(struct super_block *sb, int type, int wait) { struct list_head *dirty; struct dquot *dquot; @@ -609,6 +616,33 @@ int vfs_quota_sync(struct super_block *sb, int type) spin_unlock(&dq_list_lock); mutex_unlock(&dqopt->dqonoff_mutex); + if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) + return 0; + + /* This is not very clever (and fast) but currently I don't know about + * any other simple way of getting quota data to disk and we must get + * them there for userspace to be visible... */ + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. 
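 */

/*
 * Editor's aside, not part of the patch, on the new fs/quota/compat.c seen
 * earlier: on i386 a u64 struct member is 4-byte aligned, while 64-bit
 * kernels align it to 8 bytes, so a struct if_dqblk filled in by a 32-bit
 * quota tool does not match the native layout. compat_u64 (available under
 * CONFIG_COMPAT) reproduces the 32-bit alignment. Hypothetical demo structs:
 */
#include <linux/compat.h>
#include <linux/types.h>

struct demo_native {            /* 64-bit kernel layout: sizeof == 16 */
        u64 limit;
        unsigned int valid;
};

struct demo_compat {            /* 32-bit userspace layout: sizeof == 12 */
        compat_u64 limit;       /* __attribute__((aligned(4))) on x86 */
        compat_uint_t valid;
};

/* vfs_quota_sync() continues: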
+ */ + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, + I_MUTEX_QUOTA); + truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); + mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); + } + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return 0; } EXPORT_SYMBOL(vfs_quota_sync); @@ -840,11 +874,14 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; + int reserved = 0; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; if (!atomic_read(&inode->i_writecount)) continue; if (!dqinit_needed(inode, type)) @@ -854,7 +891,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); - sb->dq_op->initialize(inode, type); + __dquot_initialize(inode, type); /* We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the inode_lock. * We cannot iput the inode now as we can be holding the last @@ -865,6 +902,12 @@ static void add_dquot_ref(struct super_block *sb, int type) } spin_unlock(&inode_lock); iput(old_inode); + + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened before quota" + " was turned on thus quota information is probably " + "inconsistent. Please run quotacheck(8).\n", sb->s_id); + } } /* @@ -978,10 +1021,12 @@ static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) /* * Claim reserved quota space */ -static void dquot_claim_reserved_space(struct dquot *dquot, - qsize_t number) +static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) { - WARN_ON(dquot->dq_dqb.dqb_rsvspace < number); + if (dquot->dq_dqb.dqb_rsvspace < number) { + WARN_ON_ONCE(1); + number = dquot->dq_dqb.dqb_rsvspace; + } dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; } @@ -989,7 +1034,12 @@ static void dquot_claim_reserved_space(struct dquot *dquot, static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { - dquot->dq_dqb.dqb_rsvspace -= number; + if (dquot->dq_dqb.dqb_rsvspace >= number) + dquot->dq_dqb.dqb_rsvspace -= number; + else { + WARN_ON_ONCE(1); + dquot->dq_dqb.dqb_rsvspace = 0; + } } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) @@ -1131,13 +1181,13 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_IHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1146,7 +1196,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) get_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { *warntype = QUOTA_NL_ISOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1157,7 +1207,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; } - return QUOTA_OK; + 
return 0; } /* needs dq_data_lock */ @@ -1169,7 +1219,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war *warntype = QUOTA_NL_NOWARN; if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return QUOTA_OK; + return 0; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space; @@ -1179,7 +1229,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BHARDWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1189,7 +1239,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BSOFTLONGWARN; - return NO_QUOTA; + return -EDQUOT; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1205,10 +1255,10 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ - return NO_QUOTA; + return -EDQUOT; } - return QUOTA_OK; + return 0; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) @@ -1242,25 +1292,32 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } + /* - * Initialize quota pointers in inode - * We do things in a bit complicated way but by that we avoid calling - * dqget() and thus filesystem callbacks under dqptr_sem. + * Initialize quota pointers in inode + * + * We do things in a bit complicated way but by that we avoid calling + * dqget() and thus filesystem callbacks under dqptr_sem. + * + * It is better to call this function outside of any transaction as it + * might need a lot of space in journal for dquot structure allocation. */ -int dquot_initialize(struct inode *inode, int type) +static void __dquot_initialize(struct inode *inode, int type) { unsigned int id = 0; - int cnt, ret = 0; - struct dquot *got[MAXQUOTAS] = { NULL, NULL }; + int cnt; + struct dquot *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; + qsize_t rsv; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return 0; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + got[cnt] = NULL; if (type != -1 && cnt != type) continue; switch (cnt) { @@ -1275,7 +1332,6 @@ int dquot_initialize(struct inode *inode, int type) } down_write(&sb_dqopt(sb)->dqptr_sem); - /* Having dqptr_sem we know NOQUOTA flags can't be altered... 
*/ if (IS_NOQUOTA(inode)) goto out_err; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1287,20 +1343,31 @@ int dquot_initialize(struct inode *inode, int type) if (!inode->i_dquot[cnt]) { inode->i_dquot[cnt] = got[cnt]; got[cnt] = NULL; + /* + * Make quota reservation system happy if someone + * did a write before quota was turned on + */ + rsv = inode_get_rsv_space(inode); + if (unlikely(rsv)) + dquot_resv_space(inode->i_dquot[cnt], rsv); } } out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ dqput_all(got); - return ret; +} + +void dquot_initialize(struct inode *inode) +{ + __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); /* * Release all quotas referenced by inode */ -int dquot_drop(struct inode *inode) +static void __dquot_drop(struct inode *inode) { int cnt; struct dquot *put[MAXQUOTAS]; @@ -1312,32 +1379,31 @@ int dquot_drop(struct inode *inode) } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); dqput_all(put); - return 0; } -EXPORT_SYMBOL(dquot_drop); -/* Wrapper to remove references to quota structures from inode */ -void vfs_dq_drop(struct inode *inode) -{ - /* Here we can get arbitrary inode from clear_inode() so we have - * to be careful. OTOH we don't need locking as quota operations - * are allowed to change only at mount time */ - if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op - && inode->i_sb->dq_op->drop) { - int cnt; - /* Test before calling to rule out calls from proc and such - * where we are not allowed to block. Note that this is - * actually reliable test even without the lock - the caller - * must assure that nobody can come after the DQUOT_DROP and - * add quota pointers back anyway */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - break; - if (cnt < MAXQUOTAS) - inode->i_sb->dq_op->drop(inode); - } -} -EXPORT_SYMBOL(vfs_dq_drop); +void dquot_drop(struct inode *inode) +{ + int cnt; + + if (IS_NOQUOTA(inode)) + return; + + /* + * Test before calling to rule out calls from proc and such + * where we are not allowed to block. Note that this is + * actually reliable test even without the lock - the caller + * must assure that nobody can come after the DQUOT_DROP and + * add quota pointers back anyway. 
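 */

/*
 * Editor's illustration, not part of the patch: with the ->drop callback
 * gone from dquot_operations, a filesystem's ->clear_inode() calls
 * dquot_drop() directly, exactly as the ocfs2_clear_inode() hunk above now
 * does (hypothetical demofs):
 */
#include <linux/quotaops.h>

static void demofs_clear_inode(struct inode *inode)
{
        dquot_drop(inode);      /* drop i_dquot[] references, if any */
        /* ... filesystem-specific teardown ... */
}

/* dquot_drop() continues: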
+ */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt]) + break; + } + + if (cnt < MAXQUOTAS) + __dquot_drop(inode); +} +EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by @@ -1351,28 +1417,30 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } -static void inode_add_rsv_space(struct inode *inode, qsize_t number) +void inode_add_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_add_rsv_space); - -static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +void inode_claim_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_claim_rsv_space); -static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +void inode_sub_rsv_space(struct inode *inode, qsize_t number) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL(inode_sub_rsv_space); static qsize_t inode_get_rsv_space(struct inode *inode) { @@ -1404,38 +1472,34 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) } /* - * Following four functions update i_blocks+i_bytes fields and - * quota information (together with appropriate checks) - * NOTE: We absolutely rely on the fact that caller dirties - * the inode (usually macros in quotaops.h care about this) and - * holds a handle for the current transaction so that dquot write and - * inode write go into the same transaction. + * This functions updates i_blocks+i_bytes fields and quota information + * (together with appropriate checks). + * + * NOTE: We absolutely rely on the fact that caller dirties the inode + * (usually helpers in quotaops.h care about this) and holds a handle for + * the current transaction so that dquot write and inode write go into the + * same transaction. 
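 */

/*
 * Editor's sketch, not part of the patch, of the calling convention the
 * NOTE above imposes: quota is charged inside an already-running
 * transaction and the caller dirties the inode, so the dquot write and the
 * inode write commit together (hypothetical demofs):
 */
#include <linux/quotaops.h>

static int demofs_alloc_blocks(struct inode *inode, qsize_t bytes)
{
        int ret;

        /* the caller has already started a journal transaction */
        ret = dquot_alloc_space_nodirty(inode, bytes);
        if (ret)
                return ret;
        /* ... perform the on-disk allocation ... */
        mark_inode_dirty(inode);        /* the caller's dirtying duty */
        return 0;
}

/* the allocation path continues: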
*/ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, - int warn, int reserve) + int warn, int reserve) { - int cnt, ret = QUOTA_OK; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_incr_space(inode, number, reserve); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_incr_space(inode, number, reserve); - goto out_unlock; - } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1443,9 +1507,9 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) - == NO_QUOTA) { - ret = NO_QUOTA; + ret = check_bdq(inode->i_dquot[cnt], number, !warn, + warntype+cnt); + if (ret) { spin_unlock(&dq_data_lock); goto out_flush_warn; } @@ -1466,61 +1530,45 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, mark_all_dquot_dirty(inode->i_dquot); out_flush_warn: flush_warnings(inode->i_dquot, warntype); -out_unlock: up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; } - -int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 0); -} -EXPORT_SYMBOL(dquot_alloc_space); - -int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) -{ - return __dquot_alloc_space(inode, number, warn, 1); -} -EXPORT_SYMBOL(dquot_reserve_space); +EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode, qsize_t number) +int dquot_alloc_inode(const struct inode *inode) { - int cnt, ret = NO_QUOTA; + int cnt, ret = 0; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) - == NO_QUOTA) + ret = check_idq(inode->i_dquot[cnt], 1, warntype + cnt); + if (ret) goto warn_put_all; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - dquot_incr_inodes(inode->i_dquot[cnt], number); + dquot_incr_inodes(inode->i_dquot[cnt], 1); } - ret = QUOTA_OK; + warn_put_all: spin_unlock(&dq_data_lock); - if (ret == QUOTA_OK) + if (ret == 0) mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1528,23 +1576,19 @@ warn_put_all: } EXPORT_SYMBOL(dquot_alloc_inode); -int dquot_claim_space(struct inode *inode, qsize_t number) +/* + * Convert in-memory reserved quotas to real consumed quotas + */ +int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - int ret = QUOTA_OK; - if (IS_NOQUOTA(inode)) { + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 
inode_claim_rsv_space(inode, number); - goto out; + return 0; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_claim_rsv_space(inode, number); - goto out; - } - spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1557,33 +1601,26 @@ int dquot_claim_space(struct inode *inode, qsize_t number) spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return 0; } -EXPORT_SYMBOL(dquot_claim_space); +EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * This operation can block, but only after everything is updated */ -int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) +void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) { -out_sub: + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { inode_decr_space(inode, number, reserve); - return QUOTA_OK; + return; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - goto out_sub; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) @@ -1603,56 +1640,34 @@ out_sub: out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; -} - -int dquot_free_space(struct inode *inode, qsize_t number) -{ - return __dquot_free_space(inode, number, 0); } -EXPORT_SYMBOL(dquot_free_space); - -/* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - __dquot_free_space(inode, number, 1); - -} -EXPORT_SYMBOL(dquot_release_reserved_space); +EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ -int dquot_free_inode(const struct inode *inode, qsize_t number) +void dquot_free_inode(const struct inode *inode) { unsigned int cnt; char warntype[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (IS_NOQUOTA(inode)) - return QUOTA_OK; + if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ - if (IS_NOQUOTA(inode)) { - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; - } spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!inode->i_dquot[cnt]) continue; - warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); - dquot_decr_inodes(inode->i_dquot[cnt], number); + warntype[cnt] = info_idq_free(inode->i_dquot[cnt], 1); + dquot_decr_inodes(inode->i_dquot[cnt], 1); } spin_unlock(&dq_data_lock); mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - return QUOTA_OK; } EXPORT_SYMBOL(dquot_free_inode); @@ -1662,37 +1677,31 @@ EXPORT_SYMBOL(dquot_free_inode); * This operation can block, but only after everything is updated * A transaction must be started when entering this function. 
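 */

/*
 * Editor's sketch, not part of the patch: the new ->setattr() call shape
 * (ocfs2_setattr above was converted the same way) -- dquot_transfer() now
 * returns 0 or a -errno rather than NO_QUOTA:
 */
#include <linux/quotaops.h>

static int demofs_setattr_transfer(struct inode *inode, struct iattr *attr)
{
        if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
            (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
                return dquot_transfer(inode, attr);
        return 0;
}

/* the transfer hunk: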
*/ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) { qsize_t space, cur_space; qsize_t rsv_space = 0; struct dquot *transfer_from[MAXQUOTAS]; struct dquot *transfer_to[MAXQUOTAS]; - int cnt, ret = QUOTA_OK; - int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid, - chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid; + int cnt, ret = 0; char warntype_to[MAXQUOTAS]; char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) - return QUOTA_OK; + return 0; /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_from[cnt] = NULL; transfer_to[cnt] = NULL; warntype_to[cnt] = QUOTA_NL_NOWARN; } - if (chuid) - transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid, - USRQUOTA); - if (chgid) - transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid, - GRPQUOTA); - + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (mask & (1 << cnt)) + transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt); + } down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); goto put_all; @@ -1706,9 +1715,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!transfer_to[cnt]) continue; transfer_from[cnt] = inode->i_dquot[cnt]; - if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == - NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, - warntype_to + cnt) == NO_QUOTA) + ret = check_idq(transfer_to[cnt], 1, warntype_to + cnt); + if (ret) + goto over_quota; + ret = check_bdq(transfer_to[cnt], space, 0, warntype_to + cnt); + if (ret) goto over_quota; } @@ -1762,22 +1773,32 @@ over_quota: /* Clear dquot pointers we don't want to dqput() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) transfer_from[cnt] = NULL; - ret = NO_QUOTA; goto warn_put_all; } -EXPORT_SYMBOL(dquot_transfer); -/* Wrapper for transferring ownership of an inode */ -int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) +/* Wrapper for transferring ownership of an inode for uid/gid only + * Called from FSXXX_setattr() + */ +int dquot_transfer(struct inode *inode, struct iattr *iattr) { + qid_t chid[MAXQUOTAS]; + unsigned long mask = 0; + + if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { + mask |= 1 << USRQUOTA; + chid[USRQUOTA] = iattr->ia_uid; + } + if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { + mask |= 1 << GRPQUOTA; + chid[GRPQUOTA] = iattr->ia_gid; + } if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { - vfs_dq_init(inode); - if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) - return 1; + dquot_initialize(inode); + return __dquot_transfer(inode, chid, mask); } return 0; } -EXPORT_SYMBOL(vfs_dq_transfer); +EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk @@ -1798,13 +1819,6 @@ EXPORT_SYMBOL(dquot_commit_info); * Definitions of diskquota operations. 
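 */

/*
 * Editor's aside, not part of the patch: after this series a filesystem's
 * dquot_operations carries only the low-level callbacks; initialize, drop,
 * alloc/free and transfer were deleted from the table (see the ocfs2 hunk
 * earlier) because the VFS now calls the dquot_* helpers directly.
 * Hypothetical minimal table reusing the generic implementations:
 */
#include <linux/quota.h>
#include <linux/quotaops.h>

static const struct dquot_operations demofs_quota_ops = {
        .write_dquot    = dquot_commit,
        .acquire_dquot  = dquot_acquire,
        .release_dquot  = dquot_release,
        .mark_dirty     = dquot_mark_dquot_dirty,
        .write_info     = dquot_commit_info,
};

/* the generic table follows: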
*/ const struct dquot_operations dquot_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, @@ -1815,6 +1829,20 @@ const struct dquot_operations dquot_operations = { }; /* + * Generic helper for ->open on filesystems supporting disk quotas. + */ +int dquot_file_open(struct inode *inode, struct file *file) +{ + int error; + + error = generic_file_open(inode, file); + if (!error && (file->f_mode & FMODE_WRITE)) + dquot_initialize(inode); + return error; +} +EXPORT_SYMBOL(dquot_file_open); + +/* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) @@ -1993,11 +2021,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - /* As we bypass the pagecache we must now flush the inode so - * that we see all the changes from userspace... */ - write_inode_now(inode, 1); - /* And now flush the block cache so that kernel sees the - * changes */ + /* As we bypass the pagecache we must now flush all the + * dirty data and invalidate caches so that kernel sees + * changes from userspace. It is not enough to just flush + * the quota file since if blocksize < pagesize, invalidation + * of the cache could fail because of other unrelated dirty + * data */ + sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } mutex_lock(&dqopt->dqonoff_mutex); @@ -2010,14 +2040,16 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. 
*/ - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); - sb->dq_op->drop(inode); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); } error = -EIO; @@ -2053,14 +2085,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - down_write(&dqopt->dqptr_sem); mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; mutex_unlock(&inode->i_mutex); - up_write(&dqopt->dqptr_sem); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c new file mode 100644 index 00000000000..2663ed90fb0 --- /dev/null +++ b/fs/quota/netlink.c @@ -0,0 +1,95 @@ + +#include <linux/cred.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/quotaops.h> +#include <linux/sched.h> +#include <net/netlink.h> +#include <net/genetlink.h> + +/* Netlink family structure for quota */ +static struct genl_family quota_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "VFS_DQUOT", + .version = 1, + .maxattr = QUOTA_NL_A_MAX, +}; + +/** + * quota_send_warning - Send warning to userspace about exceeded quota + * @type: The quota type: USRQQUOTA, GRPQUOTA,... + * @id: The user or group id of the quota that was exceeded + * @dev: The device on which the fs is mounted (sb->s_dev) + * @warntype: The type of the warning: QUOTA_NL_... + * + * This can be used by filesystems (including those which don't use + * dquot) to send a message to userspace relating to quota limits. + * + */ + +void quota_send_warning(short type, unsigned int id, dev_t dev, + const char warntype) +{ + static atomic_t seq; + struct sk_buff *skb; + void *msg_head; + int ret; + int msg_size = 4 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u64)); + + /* We have to allocate using GFP_NOFS as we are called from a + * filesystem performing write and thus further recursion into + * the fs to free some data could cause deadlocks. 
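 */

/*
 * Editor's sketch, not part of the patch: the intended call shape for the
 * helper documented above -- any filesystem, dquot-based or not, can report
 * an exceeded limit to userspace (hypothetical demofs):
 */
#include <linux/quota.h>

static void demofs_report_over_quota(struct super_block *sb, uid_t uid)
{
        quota_send_warning(USRQUOTA, uid, sb->s_dev, QUOTA_NL_BHARDWARN);
}

/* quota_send_warning() continues: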
*/ + skb = genlmsg_new(msg_size, GFP_NOFS); + if (!skb) { + printk(KERN_ERR + "VFS: Not enough memory to send quota warning.\n"); + return; + } + msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), + "a_genl_family, 0, QUOTA_NL_C_WARNING); + if (!msg_head) { + printk(KERN_ERR + "VFS: Cannot store netlink header in quota warning.\n"); + goto err_out; + } + ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); + if (ret) + goto attr_err_out; + ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); + if (ret) + goto attr_err_out; + genlmsg_end(skb, msg_head); + + genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); + return; +attr_err_out: + printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); +err_out: + kfree_skb(skb); +} +EXPORT_SYMBOL(quota_send_warning); + +static int __init quota_init(void) +{ + if (genl_register_family("a_genl_family) != 0) + printk(KERN_ERR + "VFS: Failed to create quota netlink interface.\n"); + return 0; +}; + +module_init(quota_init); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ee91e275695..95388f9b735 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,7 +10,6 @@ #include <linux/slab.h> #include <asm/current.h> #include <asm/uaccess.h> -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -18,220 +17,205 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> -#include <net/netlink.h> -#include <net/genetlink.h> +#include <linux/writeback.h> -/* Check validity of generic quotactl commands */ -static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) { - if (type >= MAXQUOTAS) - return -EINVAL; - if (!sb && cmd != Q_SYNC) - return -ENODEV; - /* Is operation supported? */ - if (sb && !sb->s_qcop) - return -ENOSYS; - switch (cmd) { - case Q_GETFMT: - break; - case Q_QUOTAON: - if (!sb->s_qcop->quota_on) - return -ENOSYS; - break; - case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - break; - case Q_SETINFO: - if (!sb->s_qcop->set_info) - return -ENOSYS; - break; - case Q_GETINFO: - if (!sb->s_qcop->get_info) - return -ENOSYS; - break; - case Q_SETQUOTA: - if (!sb->s_qcop->set_dqblk) - return -ENOSYS; - break; - case Q_GETQUOTA: - if (!sb->s_qcop->get_dqblk) - return -ENOSYS; - break; - case Q_SYNC: - if (sb && !sb->s_qcop->quota_sync) - return -ENOSYS; + /* these commands do not require any special privilegues */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && current_euid() == id) || + (type == GRPQUOTA && in_egroup_p(id))) break; - default: - return -EINVAL; + /*FALLTHROUGH*/ + default: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; } - /* Is quota turned on for commands which need it? 
diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ee91e275695..95388f9b735 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,7 +10,6 @@ #include <linux/slab.h> #include <asm/current.h> #include <asm/uaccess.h> -#include <linux/compat.h> #include <linux/kernel.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -18,220 +17,205 @@ #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/types.h> -#include <net/netlink.h> -#include <net/genetlink.h> +#include <linux/writeback.h> -/* Check validity of generic quotactl commands */ -static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int check_quotactl_permission(struct super_block *sb, int type, int cmd, + qid_t id) { - if (type >= MAXQUOTAS) - return -EINVAL; - if (!sb && cmd != Q_SYNC) - return -ENODEV; - /* Is operation supported? */ - if (sb && !sb->s_qcop) - return -ENOSYS; - switch (cmd) { - case Q_GETFMT: - break; - case Q_QUOTAON: - if (!sb->s_qcop->quota_on) - return -ENOSYS; - break; - case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - break; - case Q_SETINFO: - if (!sb->s_qcop->set_info) - return -ENOSYS; - break; - case Q_GETINFO: - if (!sb->s_qcop->get_info) - return -ENOSYS; - break; - case Q_SETQUOTA: - if (!sb->s_qcop->set_dqblk) - return -ENOSYS; - break; - case Q_GETQUOTA: - if (!sb->s_qcop->get_dqblk) - return -ENOSYS; - break; - case Q_SYNC: - if (sb && !sb->s_qcop->quota_sync) - return -ENOSYS; + /* these commands do not require any special privileges */ + case Q_GETFMT: + case Q_SYNC: + case Q_GETINFO: + case Q_XGETQSTAT: + case Q_XQUOTASYNC: + break; + /* allow to query information for dquots we "own" */ + case Q_GETQUOTA: + case Q_XGETQUOTA: + if ((type == USRQUOTA && current_euid() == id) || + (type == GRPQUOTA && in_egroup_p(id))) break; - default: - return -EINVAL; + /*FALLTHROUGH*/ + default: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; } - /* Is quota turned on for commands which need it? */ - switch (cmd) { - case Q_GETFMT: - case Q_GETINFO: - case Q_SETINFO: - case Q_SETQUOTA: - case Q_GETQUOTA: - /* This is just an informative test so we are satisfied - * without the lock */ - if (!sb_has_quota_active(sb, type)) - return -ESRCH; - } + return security_quotactl(cmd, type, id, sb); +} - /* Check privileges */ - if (cmd == Q_GETQUOTA) { - if (((type == USRQUOTA && current_euid() != id) || - (type == GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; +static int quota_sync_all(int type) +{ + struct super_block *sb; + int ret; + + if (type >= MAXQUOTAS) + return -EINVAL; + ret = security_quotactl(Q_SYNC, type, 0, NULL); + if (ret) + return ret; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_qcop || !sb->s_qcop->quota_sync) + continue; + + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + sb->s_qcop->quota_sync(sb, type, 1); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } - else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + spin_unlock(&sb_lock); return 0; } -/* Check validity of XFS Quota Manager commands */ -static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, + void __user *addr) { - if (type >= XQM_MAXQUOTAS) - return -EINVAL; - if (!sb) - return -ENODEV; - if (!sb->s_qcop) - return -ENOSYS; + char *pathname; + int ret = -ENOSYS; + + pathname = getname(addr); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + if (sb->s_qcop->quota_on) + ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); + putname(pathname); + return ret; +} - switch (cmd) { - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: - if (!sb->s_qcop->set_xstate) - return -ENOSYS; - break; - case Q_XGETQSTAT: - if (!sb->s_qcop->get_xstate) - return -ENOSYS; - break; - case Q_XSETQLIM: - if (!sb->s_qcop->set_xquota) - return -ENOSYS; - break; - case Q_XGETQUOTA: - if (!sb->s_qcop->get_xquota) - return -ENOSYS; - break; - case Q_XQUOTASYNC: - if (!sb->s_qcop->quota_sync) - return -ENOSYS; - break; - default: - return -EINVAL; - } +static int quota_getfmt(struct super_block *sb, int type, void __user *addr) +{ + __u32 fmt; - /* Check privileges */ - if (cmd == Q_XGETQUOTA) { - if (((type == XQM_USRQUOTA && current_euid() != id) || - (type == XQM_GRPQUOTA && !in_egroup_p(id))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + down_read(&sb_dqopt(sb)->dqptr_sem); + if (!sb_has_quota_active(sb, type)) { + up_read(&sb_dqopt(sb)->dqptr_sem); + return -ESRCH; } + fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; + up_read(&sb_dqopt(sb)->dqptr_sem); + if (copy_to_user(addr, &fmt, sizeof(fmt))) + return -EFAULT; + return 0; +} +static int quota_getinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + int ret; + + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_info) + return -ENOSYS; + ret = sb->s_qcop->get_info(sb, type, &info); + if (!ret && copy_to_user(addr, &info, sizeof(info))) + return -EFAULT; + return ret; +} + +static int quota_setinfo(struct super_block *sb, int type, void __user *addr) +{ + struct if_dqinfo info; + + if (copy_from_user(&info, addr, sizeof(info))) + return -EFAULT; + if 
(!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_info) + return -ENOSYS; + return sb->s_qcop->set_info(sb, type, &info); +} + +static int quota_getquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct if_dqblk idq; + int ret; + + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->get_dqblk) + return -ENOSYS; + ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); + if (ret) + return ret; + if (copy_to_user(addr, &idq, sizeof(idq))) + return -EFAULT; return 0; } -static int check_quotactl_valid(struct super_block *sb, int type, int cmd, - qid_t id) +static int quota_setquota(struct super_block *sb, int type, qid_t id, + void __user *addr) { - int error; - - if (XQM_COMMAND(cmd)) - error = xqm_quotactl_valid(sb, type, cmd, id); - else - error = generic_quotactl_valid(sb, type, cmd, id); - if (!error) - error = security_quotactl(cmd, type, id, sb); - return error; + struct if_dqblk idq; + + if (copy_from_user(&idq, addr, sizeof(idq))) + return -EFAULT; + if (!sb_has_quota_active(sb, type)) + return -ESRCH; + if (!sb->s_qcop->set_dqblk) + return -ENOSYS; + return sb->s_qcop->set_dqblk(sb, type, id, &idq); } -#ifdef CONFIG_QUOTA -void sync_quota_sb(struct super_block *sb, int type) +static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) { - int cnt; + __u32 flags; - if (!sb->s_qcop->quota_sync) - return; + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->set_xstate) + return -ENOSYS; + return sb->s_qcop->set_xstate(sb, flags, cmd); +} - sb->s_qcop->quota_sync(sb, type); +static int quota_getxstate(struct super_block *sb, void __user *addr) +{ + struct fs_quota_stat fqs; + int ret; - if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) - return; - /* This is not very clever (and fast) but currently I don't know about - * any other simple way of getting quota data to disk and we must get - * them there for userspace to be visible... */ - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + if (!sb->s_qcop->get_xstate) + return -ENOSYS; + ret = sb->s_qcop->get_xstate(sb, &fqs); + if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) + return -EFAULT; + return ret; +} - /* - * Now when everything is written we can discard the pagecache so - * that userspace sees the changes. - */ - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && cnt != type) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, - I_MUTEX_QUOTA); - truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); - mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); - } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); +static int quota_setxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) +{ + struct fs_disk_quota fdq; + + if (copy_from_user(&fdq, addr, sizeof(fdq))) + return -EFAULT; + if (!sb->s_qcop->set_xquota) + return -ENOSYS; + return sb->s_qcop->set_xquota(sb, type, id, &fdq); } -#endif -static void sync_dquots(int type) +static int quota_getxquota(struct super_block *sb, int type, qid_t id, + void __user *addr) { - struct super_block *sb; - int cnt; + struct fs_disk_quota fdq; + int ret; - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - /* This test just improves performance so it needn't be - * reliable... 
*/ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (type != -1 && type != cnt) - continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && - list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) - continue; - break; - } - if (cnt == MAXQUOTAS) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sync_quota_sb(sb, type); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); + if (!sb->s_qcop->get_xquota) + return -ENOSYS; + ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); + if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + return -EFAULT; + return ret; } /* Copy parameters and call proper function */ @@ -240,117 +224,55 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, { int ret; + if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS)) + return -EINVAL; + if (!sb->s_qcop) + return -ENOSYS; + + ret = check_quotactl_permission(sb, type, cmd, id); + if (ret < 0) + return ret; + switch (cmd) { - case Q_QUOTAON: { - char *pathname; - - pathname = getname(addr); - if (IS_ERR(pathname)) - return PTR_ERR(pathname); - ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); - putname(pathname); - return ret; - } - case Q_QUOTAOFF: - return sb->s_qcop->quota_off(sb, type, 0); - - case Q_GETFMT: { - __u32 fmt; - - down_read(&sb_dqopt(sb)->dqptr_sem); - if (!sb_has_quota_active(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); - return -ESRCH; - } - fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - up_read(&sb_dqopt(sb)->dqptr_sem); - if (copy_to_user(addr, &fmt, sizeof(fmt))) - return -EFAULT; - return 0; - } - case Q_GETINFO: { - struct if_dqinfo info; - - ret = sb->s_qcop->get_info(sb, type, &info); - if (ret) - return ret; - if (copy_to_user(addr, &info, sizeof(info))) - return -EFAULT; - return 0; - } - case Q_SETINFO: { - struct if_dqinfo info; - - if (copy_from_user(&info, addr, sizeof(info))) - return -EFAULT; - return sb->s_qcop->set_info(sb, type, &info); - } - case Q_GETQUOTA: { - struct if_dqblk idq; - - ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); - if (ret) - return ret; - if (copy_to_user(addr, &idq, sizeof(idq))) - return -EFAULT; - return 0; - } - case Q_SETQUOTA: { - struct if_dqblk idq; - - if (copy_from_user(&idq, addr, sizeof(idq))) - return -EFAULT; - return sb->s_qcop->set_dqblk(sb, type, id, &idq); - } - case Q_SYNC: - if (sb) - sync_quota_sb(sb, type); - else - sync_dquots(type); - return 0; - - case Q_XQUOTAON: - case Q_XQUOTAOFF: - case Q_XQUOTARM: { - __u32 flags; - - if (copy_from_user(&flags, addr, sizeof(flags))) - return -EFAULT; - return sb->s_qcop->set_xstate(sb, flags, cmd); - } - case Q_XGETQSTAT: { - struct fs_quota_stat fqs; - - if ((ret = sb->s_qcop->get_xstate(sb, &fqs))) - return ret; - if (copy_to_user(addr, &fqs, sizeof(fqs))) - return -EFAULT; - return 0; - } - case Q_XSETQLIM: { - struct fs_disk_quota fdq; - - if (copy_from_user(&fdq, addr, sizeof(fdq))) - return -EFAULT; - return sb->s_qcop->set_xquota(sb, type, id, &fdq); - } - case Q_XGETQUOTA: { - struct fs_disk_quota fdq; - - ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); - if (ret) - return ret; - if (copy_to_user(addr, &fdq, sizeof(fdq))) - return -EFAULT; - return 0; - } - case Q_XQUOTASYNC: - return sb->s_qcop->quota_sync(sb, type); - /* We never reach here unless validity check is broken */ - default: - BUG(); + case Q_QUOTAON: + return quota_quotaon(sb, type, cmd, id, addr); + 
case Q_QUOTAOFF: + if (!sb->s_qcop->quota_off) + return -ENOSYS; + return sb->s_qcop->quota_off(sb, type, 0); + case Q_GETFMT: + return quota_getfmt(sb, type, addr); + case Q_GETINFO: + return quota_getinfo(sb, type, addr); + case Q_SETINFO: + return quota_setinfo(sb, type, addr); + case Q_GETQUOTA: + return quota_getquota(sb, type, id, addr); + case Q_SETQUOTA: + return quota_setquota(sb, type, id, addr); + case Q_SYNC: + if (!sb->s_qcop->quota_sync) + return -ENOSYS; + return sb->s_qcop->quota_sync(sb, type, 1); + case Q_XQUOTAON: + case Q_XQUOTAOFF: + case Q_XQUOTARM: + return quota_setxstate(sb, cmd, addr); + case Q_XGETQSTAT: + return quota_getxstate(sb, addr); + case Q_XSETQLIM: + return quota_setxquota(sb, type, id, addr); + case Q_XGETQUOTA: + return quota_getxquota(sb, type, id, addr); + case Q_XQUOTASYNC: + /* caller already holds s_umount */ + if (sb->s_flags & MS_RDONLY) + return -EROFS; + writeback_inodes_sb(sb); + return 0; + default: + return -EINVAL; } - return 0; } /* @@ -397,224 +319,23 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; - if (cmds != Q_SYNC || special) { - sb = quotactl_block(special); - if (IS_ERR(sb)) - return PTR_ERR(sb); + /* + * As a special case Q_SYNC can be called without a specific device. + * It will iterate all superblocks that have quota enabled and call + * the sync action on each of them. + */ + if (!special) { + if (cmds == Q_SYNC) + return quota_sync_all(type); + return -ENODEV; } - ret = check_quotactl_valid(sb, type, cmds, id); - if (ret >= 0) - ret = do_quotactl(sb, type, cmds, id, addr); - if (sb) - drop_super(sb); + sb = quotactl_block(special); + if (IS_ERR(sb)) + return PTR_ERR(sb); - return ret; -} - -#if defined(CONFIG_COMPAT_FOR_U64_ALIGNMENT) -/* - * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64) - * and is necessary due to alignment problems. 
- */ -struct compat_if_dqblk { - compat_u64 dqb_bhardlimit; - compat_u64 dqb_bsoftlimit; - compat_u64 dqb_curspace; - compat_u64 dqb_ihardlimit; - compat_u64 dqb_isoftlimit; - compat_u64 dqb_curinodes; - compat_u64 dqb_btime; - compat_u64 dqb_itime; - compat_uint_t dqb_valid; -}; - -/* XFS structures */ -struct compat_fs_qfilestat { - compat_u64 dqb_bhardlimit; - compat_u64 qfs_nblks; - compat_uint_t qfs_nextents; -}; - -struct compat_fs_quota_stat { - __s8 qs_version; - __u16 qs_flags; - __s8 qs_pad; - struct compat_fs_qfilestat qs_uquota; - struct compat_fs_qfilestat qs_gquota; - compat_uint_t qs_incoredqs; - compat_int_t qs_btimelimit; - compat_int_t qs_itimelimit; - compat_int_t qs_rtbtimelimit; - __u16 qs_bwarnlimit; - __u16 qs_iwarnlimit; -}; - -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) -{ - unsigned int cmds; - struct if_dqblk __user *dqblk; - struct compat_if_dqblk __user *compat_dqblk; - struct fs_quota_stat __user *fsqstat; - struct compat_fs_quota_stat __user *compat_fsqstat; - compat_uint_t data; - u16 xdata; - long ret; + ret = do_quotactl(sb, type, cmds, id, addr); - cmds = cmd >> SUBCMDSHIFT; - - switch (cmds) { - case Q_GETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = sys_quotactl(cmd, special, id, dqblk); - if (ret) - break; - if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || - get_user(data, &dqblk->dqb_valid) || - put_user(data, &compat_dqblk->dqb_valid)) - ret = -EFAULT; - break; - case Q_SETQUOTA: - dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); - compat_dqblk = addr; - ret = -EFAULT; - if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) || - get_user(data, &compat_dqblk->dqb_valid) || - put_user(data, &dqblk->dqb_valid)) - break; - ret = sys_quotactl(cmd, special, id, dqblk); - break; - case Q_XGETQSTAT: - fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); - compat_fsqstat = addr; - ret = sys_quotactl(cmd, special, id, fsqstat); - if (ret) - break; - ret = -EFAULT; - /* Copying qs_version, qs_flags, qs_pad */ - if (copy_in_user(compat_fsqstat, fsqstat, - offsetof(struct compat_fs_quota_stat, qs_uquota))) - break; - /* Copying qs_uquota */ - if (copy_in_user(&compat_fsqstat->qs_uquota, - &fsqstat->qs_uquota, - sizeof(compat_fsqstat->qs_uquota)) || - get_user(data, &fsqstat->qs_uquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents)) - break; - /* Copying qs_gquota */ - if (copy_in_user(&compat_fsqstat->qs_gquota, - &fsqstat->qs_gquota, - sizeof(compat_fsqstat->qs_gquota)) || - get_user(data, &fsqstat->qs_gquota.qfs_nextents) || - put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents)) - break; - /* Copying the rest */ - if (copy_in_user(&compat_fsqstat->qs_incoredqs, - &fsqstat->qs_incoredqs, - sizeof(struct compat_fs_quota_stat) - - offsetof(struct compat_fs_quota_stat, qs_incoredqs)) || - get_user(xdata, &fsqstat->qs_iwarnlimit) || - put_user(xdata, &compat_fsqstat->qs_iwarnlimit)) - break; - ret = 0; - break; - default: - ret = sys_quotactl(cmd, special, id, addr); - } + drop_super(sb); return ret; } -#endif - - -#ifdef CONFIG_QUOTA_NETLINK_INTERFACE - -/* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = 0, - .name = "VFS_DQUOT", - .version = 1, - .maxattr = QUOTA_NL_A_MAX, -}; - -/** - * quota_send_warning - Send warning to userspace about exceeded quota - * @type: The quota type: USRQQUOTA, 
GRPQUOTA,... - * @id: The user or group id of the quota that was exceeded - * @dev: The device on which the fs is mounted (sb->s_dev) - * @warntype: The type of the warning: QUOTA_NL_... - * - * This can be used by filesystems (including those which don't use - * dquot) to send a message to userspace relating to quota limits. - * - */ - -void quota_send_warning(short type, unsigned int id, dev_t dev, - const char warntype) -{ - static atomic_t seq; - struct sk_buff *skb; - void *msg_head; - int ret; - int msg_size = 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u64)); - - /* We have to allocate using GFP_NOFS as we are called from a - * filesystem performing write and thus further recursion into - * the fs to free some data could cause deadlocks. */ - skb = genlmsg_new(msg_size, GFP_NOFS); - if (!skb) { - printk(KERN_ERR - "VFS: Not enough memory to send quota warning.\n"); - return; - } - msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), - &quota_genl_family, 0, QUOTA_NL_C_WARNING); - if (!msg_head) { - printk(KERN_ERR - "VFS: Cannot store netlink header in quota warning.\n"); - goto err_out; - } - ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, type); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_EXCESS_ID, id); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); - if (ret) - goto attr_err_out; - ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid()); - if (ret) - goto attr_err_out; - genlmsg_end(skb, msg_head); - - genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS); - return; -attr_err_out: - printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); -err_out: - kfree_skb(skb); -} -EXPORT_SYMBOL(quota_send_warning); - -static int __init quota_init(void) -{ - if (genl_register_family(&quota_genl_family) != 0) - printk(KERN_ERR - "VFS: Failed to create quota netlink interface.\n"); - return 0; -}; - -module_init(quota_init); -#endif - diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 65c87276117..483442e66ed 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -169,7 +169,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, return 0; // No free blocks in this bitmap } - /* search for a first zero bit -- beggining of a window */ + /* search for a first zero bit -- beginning of a window */ *beg = reiserfs_find_next_zero_le_bit ((unsigned long *)(bh->b_data), boundary, *beg); @@ -425,7 +425,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, journal_mark_dirty(th, s, sbh); if (for_unformatted) - vfs_dq_free_block_nodirty(inode, 1); + dquot_free_block_nodirty(inode, 1); } void reiserfs_free_block(struct reiserfs_transaction_handle *th, @@ -1049,7 +1049,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start amount_needed, hint->inode->i_uid); #endif quota_ret = - vfs_dq_alloc_block_nodirty(hint->inode, amount_needed); + dquot_alloc_block_nodirty(hint->inode, amount_needed); if (quota_ret) /* Quota exceeded? 
*/ return QUOTA_EXCEEDED; if (hint->preallocate && hint->prealloc_size) { @@ -1058,7 +1058,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); #endif - quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode, + quota_ret = dquot_prealloc_block_nodirty(hint->inode, hint->prealloc_size); if (quota_ret) hint->preallocate = hint->prealloc_size = 0; @@ -1092,7 +1092,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start hint->inode->i_uid); #endif /* Free not allocated blocks */ - vfs_dq_free_block_nodirty(hint->inode, + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); } @@ -1125,7 +1125,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif - vfs_dq_free_block_nodirty(hint->inode, amount_needed + + dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index da2dba082e2..1d9c12714c5 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -289,7 +289,7 @@ const struct file_operations reiserfs_file_operations = { .compat_ioctl = reiserfs_compat_ioctl, #endif .mmap = reiserfs_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, .aio_read = generic_file_aio_read, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2df0f5c7c60..d1da94b82d8 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -34,6 +34,9 @@ void reiserfs_delete_inode(struct inode *inode) int depth; int err; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); depth = reiserfs_write_lock_once(inode->i_sb); @@ -54,7 +57,7 @@ void reiserfs_delete_inode(struct inode *inode) * after delete_object so that quota updates go into the same transaction as * stat data deletion */ if (!err) - vfs_dq_free_inode(inode); + dquot_free_inode(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; @@ -1615,7 +1618,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, ** to properly mark inodes for datasync and such, but only actually ** does something when called for a synchronous update. */ -int reiserfs_write_inode(struct inode *inode, int do_sync) +int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct reiserfs_transaction_handle th; int jbegin_count = 1; @@ -1627,7 +1630,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync) ** inode needs to reach disk for safety, and they can safely be ** ignored because the altered inode has already been logged. 
*/ - if (do_sync && !(current->flags & PF_MEMALLOC)) { + if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { reiserfs_write_lock(inode->i_sb); if (!journal_begin(&th, inode->i_sb, jbegin_count)) { reiserfs_update_sd(&th, inode); @@ -1765,10 +1768,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - if (vfs_dq_alloc_inode(inode)) { - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) goto out_end_trans; - } if (!dir->i_nlink) { err = -EPERM; goto out_bad_inode; @@ -1959,12 +1962,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, INODE_PKEY(inode)->k_objectid = 0; /* Quota change must be inside a transaction for journaling */ - vfs_dq_free_inode(inode); + dquot_free_inode(inode); out_end_trans: journal_end(th, th->t_super, th->t_blocks_allocated); /* Drop can be outside and it needs more credits so it's better to have it outside */ - vfs_dq_drop(inode); + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); @@ -3073,6 +3076,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { + dquot_initialize(inode); + /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate */ @@ -3134,8 +3139,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) jbegin_count); if (error) goto out; - error = - vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { journal_end(&th, inode->i_sb, jbegin_count); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 9d4dcf0b07c..96e4cbbfaa1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, */ static int drop_new_inode(struct inode *inode) { - vfs_dq_drop(inode); + dquot_drop(inode); make_bad_inode(inode); inode->i_flags |= S_NOQUOTA; iput(inode); @@ -554,7 +554,7 @@ static int drop_new_inode(struct inode *inode) } /* utility function that does setup for reiserfs_new_inode. -** vfs_dq_init needs lots of credits so it's better to have it +** dquot_initialize needs lots of credits so it's better to have it ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. 
*/ @@ -577,7 +577,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) } else { inode->i_gid = current_fsgid(); } - vfs_dq_init(inode); + dquot_initialize(inode); return 0; } @@ -594,6 +594,8 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; + dquot_initialize(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -666,6 +668,8 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!new_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM; } @@ -739,6 +743,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + dquot_initialize(dir); + #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ REISERFS_I(dir)->new_packing_locality = 1; @@ -842,6 +848,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + dquot_initialize(dir); + reiserfs_write_lock(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) @@ -923,6 +931,8 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) unsigned long savelink; int depth; + dquot_initialize(dir); + inode = dentry->d_inode; /* in this transaction we can be doing at max two balancings and update @@ -1024,6 +1034,8 @@ static int reiserfs_symlink(struct inode *parent_dir, 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + dquot_initialize(parent_dir); + if (!(inode = new_inode(parent_dir->i_sb))) { return -ENOMEM; } @@ -1111,6 +1123,8 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + dquot_initialize(dir); + reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { //FIXME: sd_nlink is 32 bit for new files @@ -1235,6 +1249,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5fa7118f04e..313d39d639e 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1299,7 +1299,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, inode->i_uid, head2type(&s_ih)); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + dquot_free_space_nodirty(inode, quota_cut_bytes); /* Return deleted body length */ return ret_value; @@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, quota_cut_bytes, inode->i_uid, key2type(key)); #endif - vfs_dq_free_space_nodirty(inode, + dquot_free_space_nodirty(inode, quota_cut_bytes); } break; @@ -1733,7 +1733,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, '?'); #endif - vfs_dq_free_space_nodirty(inode, quota_cut_bytes); + 
dquot_free_space_nodirty(inode, quota_cut_bytes); return ret_value; } @@ -1968,9 +1968,10 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree key2type(&(key->on_disk_key))); #endif - if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) { + retval = dquot_alloc_space_nodirty(inode, pasted_size); + if (retval) { pathrelse(search_path); - return -EDQUOT; + return retval; } init_tb_struct(th, &s_paste_balance, th->t_super, search_path, pasted_size); @@ -2024,7 +2025,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree pasted_size, inode->i_uid, key2type(&(key->on_disk_key))); #endif - vfs_dq_free_space_nodirty(inode, pasted_size); + dquot_free_space_nodirty(inode, pasted_size); return retval; } @@ -2062,9 +2063,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, #endif /* We can't dirty inode here. It would be immediately written but * appropriate stat item isn't inserted yet... */ - if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) { + retval = dquot_alloc_space_nodirty(inode, quota_bytes); + if (retval) { pathrelse(path); - return -EDQUOT; + return retval; } } init_tb_struct(th, &s_ins_balance, th->t_super, path, @@ -2113,6 +2115,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, quota_bytes, inode->i_uid, head2type(ih)); #endif if (inode) - vfs_dq_free_space_nodirty(inode, quota_bytes); + dquot_free_space_nodirty(inode, quota_bytes); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b4a7dd03bdb..04bf5d791bd 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -246,7 +246,7 @@ static int finish_unfinished(struct super_block *s) retval = remove_save_link_only(s, &save_link_key, 0); continue; } - vfs_dq_init(inode); + dquot_initialize(inode); if (truncate && S_ISDIR(inode->i_mode)) { /* We got a truncate request for a dir which is impossible. 
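Alongside the renames, the reiserfs hunks above change error propagation: the old vfs_dq_* allocation helpers returned a boolean-style failure that callers flattened to -EDQUOT, while the dquot_* replacements return a real error code that is passed through. The before/after pattern from reiserfs_paste_into_item, reduced to its core:

	/* before: any quota failure became -EDQUOT */
	if (vfs_dq_alloc_space_nodirty(inode, pasted_size)) {
		pathrelse(search_path);
		return -EDQUOT;
	}

	/* after: the helper's own error code is propagated */
	retval = dquot_alloc_space_nodirty(inode, pasted_size);
	if (retval) {
		pathrelse(search_path);
		return retval;
	}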
@@ -578,6 +578,11 @@ out: reiserfs_write_unlock_once(inode->i_sb, lock_depth); } +static void reiserfs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -590,6 +595,7 @@ static const struct super_operations reiserfs_sops = { .destroy_inode = reiserfs_destroy_inode, .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, + .clear_inode = reiserfs_clear_inode, .delete_inode = reiserfs_delete_inode, .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, @@ -616,13 +622,6 @@ static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static const struct dquot_operations reiserfs_quota_operations = { - .initialize = dquot_initialize, - .drop = dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, .write_dquot = reiserfs_write_dquot, .acquire_dquot = reiserfs_acquire_dquot, .release_dquot = reiserfs_release_dquot, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 81f09fab8ae..37d034ca7d9 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -61,7 +61,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->create(dir, dentry, mode, NULL); } #endif @@ -69,7 +68,6 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) static int xattr_mkdir(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); return dir->i_op->mkdir(dir, dentry, mode); } @@ -81,7 +79,6 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); @@ -97,7 +94,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; BUG_ON(!mutex_is_locked(&dir->i_mutex)); - vfs_dq_init(dir); reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, I_MUTEX_CHILD, dir->i_sb); diff --git a/fs/select.c b/fs/select.c index fd38ce2e32e..500a669f779 100644 --- a/fs/select.c +++ b/fs/select.c @@ -691,6 +691,23 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, } #endif /* HAVE_SET_RESTORE_SIGMASK */ +#ifdef __ARCH_WANT_SYS_OLD_SELECT +struct sel_arg_struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; +}; + +SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) +{ + struct sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); +} +#endif + struct poll_list { struct poll_list *next; int len; @@ -821,7 +838,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct poll_list *walk = head; unsigned long todo = nfds; - if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); diff --git a/fs/seq_file.c b/fs/seq_file.c index 5afd554efad..e1f437be6c3 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -734,7 +734,7 @@ EXPORT_SYMBOL(seq_hlist_start_head); * seq_hlist_next - move to the next position of the hlist * @v: the current iterator 
* @head: the head of the hlist - * @pos: the current posision + * @ppos: the current position * * Called at seq_file->op->next(). */ @@ -800,7 +800,7 @@ EXPORT_SYMBOL(seq_hlist_start_head_rcu); * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU * @v: the current iterator * @head: the head of the hlist - * @pos: the current posision + * @ppos: the current position * * Called at seq_file->op->next(). * diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 70e3244fa30..df8a19ef870 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o +squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2a796031034..1cb0d81b164 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -29,15 +29,14 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/mutex.h> #include <linux/string.h> #include <linux/buffer_head.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "decompressor.h" /* * Read the metadata block length, this is stored in the first two @@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (compressed) { - int zlib_err = 0, zlib_init = 0; - - /* - * Uncompress block. - */ - - mutex_lock(&msblk->read_data_mutex); - - msblk->stream.avail_out = 0; - msblk->stream.avail_in = 0; - - bytes = length; - do { - if (msblk->stream.avail_in == 0 && k < b) { - avail = min(bytes, msblk->devblksize - offset); - bytes -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - - if (avail == 0) { - offset = 0; - put_bh(bh[k++]); - continue; - } - - msblk->stream.next_in = bh[k]->b_data + offset; - msblk->stream.avail_in = avail; - offset = 0; - } - - if (msblk->stream.avail_out == 0 && page < pages) { - msblk->stream.next_out = buffer[page++]; - msblk->stream.avail_out = PAGE_CACHE_SIZE; - } - - if (!zlib_init) { - zlib_err = zlib_inflateInit(&msblk->stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflateInit returned" - " unexpected result 0x%x," - " srclength %d\n", zlib_err, - srclength); - goto release_mutex; - } - zlib_init = 1; - } - - zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); - - if (msblk->stream.avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); - - if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } - - zlib_err = zlib_inflateEnd(&msblk->stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } - length = msblk->stream.total_out; - mutex_unlock(&msblk->read_data_mutex); + length = squashfs_decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); + if (length < 0) + goto read_failure; } else { /* * Block is uncompressed. 
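The deletion above is the heart of the squashfs series: the open-coded zlib loop in squashfs_read_data() becomes a single call through a per-superblock decompressor. To show the shape of that interface (struct squashfs_decompressor, quoted from decompressor.h below), here is a hypothetical skeleton backend; every name is invented, and a real implementation fills in decompress() the way zlib_wrapper.c at the end of this series does:

	static void *example_comp_init(struct squashfs_sb_info *msblk)
	{
		/* per-mount decompressor state; a NULL return means failure */
		return kmalloc(sizeof(int), GFP_KERNEL);
	}

	static void example_comp_free(void *strm)
	{
		kfree(strm);
	}

	static int example_comp_decompress(struct squashfs_sb_info *msblk,
		void **buffer, struct buffer_head **bh, int b, int offset,
		int length, int srclength, int pages)
	{
		/* a real backend inflates the b buffer_heads in bh[] into the
		 * page-sized buffers in buffer[] and returns the number of
		 * bytes produced, or a negative errno on failure */
		return -EIO;
	}

	static const struct squashfs_decompressor squashfs_example_comp_ops = {
		.init = example_comp_init,
		.free = example_comp_free,
		.decompress = example_comp_decompress,
		.id = 0,	/* a real backend uses its on-disk id; 0 ends the lookup table */
		.name = "example",
		.supported = 1
	};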
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, kfree(bh); return length; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); - block_release: for (; k < b; k++) put_bh(bh[k]); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 40c98fa6b5d..57314bee905 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -51,7 +51,6 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/wait.h> -#include <linux/zlib.h> #include <linux/pagemap.h> #include "squashfs_fs.h" diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c new file mode 100644 index 00000000000..157478da6ac --- /dev/null +++ b/fs/squashfs/decompressor.c @@ -0,0 +1,68 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.c + */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/buffer_head.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file (and decompressor.h) implements a decompressor framework for + * Squashfs, allowing multiple decompressors to be easily supported + */ + +static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { + NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 +}; + +static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = { + NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 +}; + +static const struct squashfs_decompressor squashfs_unknown_comp_ops = { + NULL, NULL, NULL, 0, "unknown", 0 +}; + +static const struct squashfs_decompressor *decompressor[] = { + &squashfs_zlib_comp_ops, + &squashfs_lzma_unsupported_comp_ops, + &squashfs_lzo_unsupported_comp_ops, + &squashfs_unknown_comp_ops +}; + + +const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) +{ + int i; + + for (i = 0; decompressor[i]->id; i++) + if (id == decompressor[i]->id) + break; + + return decompressor[i]; +} diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h new file mode 100644 index 00000000000..7425f80783f --- /dev/null +++ b/fs/squashfs/decompressor.h @@ -0,0 +1,55 @@ +#ifndef DECOMPRESSOR_H +#define DECOMPRESSOR_H +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * decompressor.h + */ + +struct squashfs_decompressor { + void *(*init)(struct squashfs_sb_info *); + void (*free)(void *); + int (*decompress)(struct squashfs_sb_info *, void **, + struct buffer_head **, int, int, int, int, int); + int id; + char *name; + int supported; +}; + +static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk) +{ + return msblk->decompressor->init(msblk); +} + +static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, + void *s) +{ + if (msblk->decompressor) + msblk->decompressor->free(s); +} + +static inline int squashfs_decompress(struct squashfs_sb_info *msblk, + void **buffer, struct buffer_head **bh, int b, int offset, int length, + int srclength, int pages) +{ + return msblk->decompressor->decompress(msblk, buffer, bh, b, offset, + length, srclength, pages); +} +#endif diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 566b0eaed86..12b933ac658 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -30,7 +30,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 2b1b8fe5e03..7f93d5a9ee0 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -39,7 +39,6 @@ #include <linux/vfs.h> #include <linux/dcache.h> #include <linux/exportfs.h> -#include <linux/zlib.h> #include <linux/slab.h> #include "squashfs_fs.h" diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 717767d831d..a25c5060bdc 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -47,7 +47,6 @@ #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mutex.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c index b5a2c15bbbc..7c90bbd6879 100644 --- a/fs/squashfs/fragment.c +++ b/fs/squashfs/fragment.c @@ -36,7 +36,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index 3795b837ba2..b7f64bcd2b7 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -34,7 +34,6 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 9101dbde39e..49daaf669e4 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -40,7 +40,6 @@ #include <linux/fs.h> #include <linux/vfs.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 9e398653b22..5266bd8ad93 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -57,7 +57,6 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/dcache.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 
0e9feb6adf7..fe2587af551 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, u64, int); extern int squashfs_read_table(struct super_block *, void *, u64, int); +/* decompressor.c */ +extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); + /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, unsigned int); @@ -71,7 +74,7 @@ extern struct inode *squashfs_iget(struct super_block *, long long, extern int squashfs_read_inode(struct inode *, long long); /* - * Inodes and files operations + * Inodes, files and decompressor operations */ /* dir.c */ @@ -88,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops; /* symlink.c */ extern const struct address_space_operations squashfs_symlink_aops; + +/* zlib_wrapper.c */ +extern const struct squashfs_decompressor squashfs_zlib_comp_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 283daafc568..79024245ea0 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,8 +183,6 @@ #define SQUASHFS_MAX_FILE_SIZE (1LL << \ (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) -#define SQUASHFS_MARKER_BYTE 0xff - /* meta index cache */ #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) #define SQUASHFS_META_ENTRIES 127 @@ -211,7 +209,9 @@ struct meta_index { /* * definitions for structures on disk */ -#define ZLIB_COMPRESSION 1 +#define ZLIB_COMPRESSION 1 +#define LZMA_COMPRESSION 2 +#define LZO_COMPRESSION 3 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index c8c65614dd1..2e77dc547e2 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -52,25 +52,25 @@ struct squashfs_cache_entry { }; struct squashfs_sb_info { - int devblksize; - int devblksize_log2; - struct squashfs_cache *block_cache; - struct squashfs_cache *fragment_cache; - struct squashfs_cache *read_page; - int next_meta_index; - __le64 *id_table; - __le64 *fragment_index; - unsigned int *fragment_index_2; - struct mutex read_data_mutex; - struct mutex meta_index_mutex; - struct meta_index *meta_index; - z_stream stream; - __le64 *inode_lookup_table; - u64 inode_table; - u64 directory_table; - unsigned int block_size; - unsigned short block_log; - long long bytes_used; - unsigned int inodes; + const struct squashfs_decompressor *decompressor; + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + struct mutex read_data_mutex; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + void *stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6c197ef53ad..3550aec2f65 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -35,34 +35,41 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/zlib.h> #include <linux/magic.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "decompressor.h" static struct file_system_type squashfs_fs_type; static const struct super_operations 
squashfs_super_ops; -static int supported_squashfs_filesystem(short major, short minor, short comp) +static const struct squashfs_decompressor *supported_squashfs_filesystem(short + major, short minor, short id) { + const struct squashfs_decompressor *decompressor; + if (major < SQUASHFS_MAJOR) { ERROR("Major/Minor mismatch, older Squashfs %d.%d " "filesystems are unsupported\n", major, minor); - return -EINVAL; + return NULL; } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { ERROR("Major/Minor mismatch, trying to mount newer " "%d.%d filesystem\n", major, minor); ERROR("Please update your kernel\n"); - return -EINVAL; + return NULL; } - if (comp != ZLIB_COMPRESSION) - return -EINVAL; + decompressor = squashfs_lookup_decompressor(id); + if (!decompressor->supported) { + ERROR("Filesystem uses \"%s\" compression. This is not " + "supported\n", decompressor->name); + return NULL; + } - return 0; + return decompressor; } @@ -87,13 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(), - GFP_KERNEL); - if (msblk->stream.workspace == NULL) { - ERROR("Failed to allocate zlib workspace\n"); - goto failure; - } - sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); if (sblk == NULL) { ERROR("Failed to allocate squashfs_super_block\n"); @@ -120,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + err = -EINVAL; + /* Check it is a SQUASHFS superblock */ sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!silent) ERROR("Can't find a SQUASHFS superblock on %s\n", bdevname(sb->s_bdev, b)); - err = -EINVAL; goto failed_mount; } - /* Check the MAJOR & MINOR versions and compression type */ - err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), + /* Check the MAJOR & MINOR versions and lookup compression type */ + msblk->decompressor = supported_squashfs_filesystem( + le16_to_cpu(sblk->s_major), le16_to_cpu(sblk->s_minor), le16_to_cpu(sblk->compression)); - if (err < 0) + if (msblk->decompressor == NULL) goto failed_mount; - err = -EINVAL; - /* * Check if there's xattrs in the filesystem. These are not * supported in this version, so warn that they will be ignored. 
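Because the version check now returns the decompressor itself, enabling a new on-disk format reduces to supplying an ops structure and editing one table in decompressor.c. Hypothetically, once a working LZMA backend existed (squashfs_lzma_comp_ops is invented here; this patch implements only zlib), the table would become:

	static const struct squashfs_decompressor *decompressor[] = {
		&squashfs_zlib_comp_ops,
		&squashfs_lzma_comp_ops,	/* hypothetical working backend */
		&squashfs_lzo_unsupported_comp_ops,
		&squashfs_unknown_comp_ops
	};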
@@ -205,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; + msblk->stream = squashfs_decompressor_init(msblk); + if (msblk->stream == NULL) + goto failed_mount; + msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); if (msblk->block_cache == NULL) @@ -292,17 +296,16 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); + squashfs_decompressor_free(msblk, msblk->stream); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); - kfree(msblk->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); return err; failure: - kfree(msblk->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; return -ENOMEM; @@ -346,10 +349,10 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); + squashfs_decompressor_free(sbi, sbi->stream); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); - kfree(sbi->stream.workspace); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 83d87880aac..e80be2022a7 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/pagemap.h> -#include <linux/zlib.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c new file mode 100644 index 00000000000..4dd70e04333 --- /dev/null +++ b/fs/squashfs/zlib_wrapper.c @@ -0,0 +1,150 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * zlib_wrapper.c + */ + + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" +#include "decompressor.h" + +static void *zlib_init(struct squashfs_sb_info *dummy) +{ + z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->workspace = kmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + if (stream->workspace == NULL) + goto failed; + + return stream; + +failed: + ERROR("Failed to allocate zlib workspace\n"); + kfree(stream); + return NULL; +} + + +static void zlib_free(void *strm) +{ + z_stream *stream = strm; + + if (stream) + kfree(stream->workspace); + kfree(stream); +} + + +static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, + struct buffer_head **bh, int b, int offset, int length, int srclength, + int pages) +{ + int zlib_err = 0, zlib_init = 0; + int avail, bytes, k = 0, page = 0; + z_stream *stream = msblk->stream; + + mutex_lock(&msblk->read_data_mutex); + + stream->avail_out = 0; + stream->avail_in = 0; + + bytes = length; + do { + if (stream->avail_in == 0 && k < b) { + avail = min(bytes, msblk->devblksize - offset); + bytes -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + if (avail == 0) { + offset = 0; + put_bh(bh[k++]); + continue; + } + + stream->next_in = bh[k]->b_data + offset; + stream->avail_in = avail; + offset = 0; + } + + if (stream->avail_out == 0 && page < pages) { + stream->next_out = buffer[page++]; + stream->avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateInit returned unexpected " + "result 0x%x, srclength %d\n", + zlib_err, srclength); + goto release_mutex; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); + + if (stream->avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + if (zlib_err != Z_STREAM_END) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + zlib_err = zlib_inflateEnd(stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflate error, data probably corrupt\n"); + goto release_mutex; + } + + mutex_unlock(&msblk->read_data_mutex); + return stream->total_out; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_zlib_comp_ops = { + .init = zlib_init, + .free = zlib_free, + .decompress = zlib_uncompress, + .id = ZLIB_COMPRESSION, + .name = "zlib", + .supported = 1 +}; + diff --git a/fs/sync.c b/fs/sync.c index 418727a2a23..f557d71cb09 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -34,14 +34,14 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (!sb->s_bdi) return 0; - /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) { - writeout_quota_sb(sb, -1); - writeback_inodes_sb(sb); - } else { - sync_quota_sb(sb, -1); + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, -1, wait); + + if (wait) sync_inodes_sb(sb); - } + else + writeback_inodes_sb(sb); + if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index a0a500af24a..e9d293593e5 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -54,14 +54,14 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) int 
rc; /* need attr_sd for attr, its parent for kobj */ - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -ENODEV; rc = -EIO; if (attr->read) rc = attr->read(kobj, attr, buffer, off, count); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return rc; } @@ -125,14 +125,14 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count) int rc; /* need attr_sd for attr, its parent for kobj */ - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -ENODEV; rc = -EIO; if (attr->write) rc = attr->write(kobj, attr, buffer, offset, count); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return rc; } @@ -184,12 +184,12 @@ static void bin_vma_open(struct vm_area_struct *vma) if (!bb->vm_ops || !bb->vm_ops->open) return; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return; bb->vm_ops->open(vma); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); } static void bin_vma_close(struct vm_area_struct *vma) @@ -201,12 +201,12 @@ static void bin_vma_close(struct vm_area_struct *vma) if (!bb->vm_ops || !bb->vm_ops->close) return; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return; bb->vm_ops->close(vma); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); } static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -219,12 +219,12 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!bb->vm_ops || !bb->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return VM_FAULT_SIGBUS; ret = bb->vm_ops->fault(vma, vmf); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return ret; } @@ -241,12 +241,12 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (!bb->vm_ops->page_mkwrite) return 0; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return VM_FAULT_SIGBUS; ret = bb->vm_ops->page_mkwrite(vma, vmf); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return ret; } @@ -261,12 +261,12 @@ static int bin_access(struct vm_area_struct *vma, unsigned long addr, if (!bb->vm_ops || !bb->vm_ops->access) return -EINVAL; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -EINVAL; ret = bb->vm_ops->access(vma, addr, buf, len, write); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return ret; } @@ -281,12 +281,12 @@ static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new) if (!bb->vm_ops || !bb->vm_ops->set_policy) return 0; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -EINVAL; ret = bb->vm_ops->set_policy(vma, new); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return ret; } @@ -301,12 +301,12 @@ static struct mempolicy *bin_get_policy(struct vm_area_struct *vma, if (!bb->vm_ops || !bb->vm_ops->get_policy) return vma->vm_policy; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return vma->vm_policy; pol = bb->vm_ops->get_policy(vma, addr); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return pol; } @@ -321,12 +321,12 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, if (!bb->vm_ops || !bb->vm_ops->migrate) return 0; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return 0; ret = bb->vm_ops->migrate(vma, from, to, flags); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return 
ret; } #endif @@ -356,7 +356,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma) /* need attr_sd for attr, its parent for kobj */ rc = -ENODEV; - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) goto out_unlock; rc = -EINVAL; @@ -384,7 +384,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma) bb->vm_ops = vma->vm_ops; vma->vm_ops = &bin_vm_ops; out_put: - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); out_unlock: mutex_unlock(&bb->mutex); @@ -399,7 +399,7 @@ static int open(struct inode * inode, struct file * file) int error; /* binary file operations requires both @sd and its parent */ - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -ENODEV; error = -EACCES; @@ -426,11 +426,11 @@ static int open(struct inode * inode, struct file * file) mutex_unlock(&sysfs_bin_lock); /* open succeeded, put active references */ - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return 0; err_out: - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); kfree(bb); return error; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 699f371b9f1..590717861c7 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -93,7 +93,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd) * RETURNS: * Pointer to @sd on success, NULL on failure. */ -static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) { if (unlikely(!sd)) return NULL; @@ -124,7 +124,7 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) * Put an active reference to @sd. This function is noop if @sd * is NULL. */ -static void sysfs_put_active(struct sysfs_dirent *sd) +void sysfs_put_active(struct sysfs_dirent *sd) { struct completion *cmpl; int v; @@ -145,45 +145,6 @@ static void sysfs_put_active(struct sysfs_dirent *sd) } /** - * sysfs_get_active_two - get active references to sysfs_dirent and parent - * @sd: sysfs_dirent of interest - * - * Get active reference to @sd and its parent. Parent's active - * reference is grabbed first. This function is noop if @sd is - * NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd) -{ - if (sd) { - if (sd->s_parent && unlikely(!sysfs_get_active(sd->s_parent))) - return NULL; - if (unlikely(!sysfs_get_active(sd))) { - sysfs_put_active(sd->s_parent); - return NULL; - } - } - return sd; -} - -/** - * sysfs_put_active_two - put active references to sysfs_dirent and parent - * @sd: sysfs_dirent of interest - * - * Put active references to @sd and its parent. This function is - * noop if @sd is NULL. 
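Throughout the bin.c and file.c hunks here the conversion follows one shape. A minimal sketch of that calling convention, not code from the patch itself — example_call_op() and example_op() are hypothetical stand-ins; only sysfs_get_active()/sysfs_put_active() are real:

	/* Sketch: guard an attribute callback with a single active
	 * reference instead of the old parent+self pair. */
	static ssize_t example_call_op(struct sysfs_dirent *attr_sd,
				       char *buf, size_t count)
	{
		ssize_t rc = -ENODEV;

		if (!sysfs_get_active(attr_sd))	/* node being removed */
			return rc;
		rc = example_op(buf, count);	/* hypothetical callback */
		sysfs_put_active(attr_sd);
		return rc;
	}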
- */ -void sysfs_put_active_two(struct sysfs_dirent *sd) -{ - if (sd) { - sysfs_put_active(sd); - sysfs_put_active(sd->s_parent); - } -} - -/** * sysfs_deactivate - deactivate sysfs_dirent * @sd: sysfs_dirent to deactivate * @@ -195,6 +156,10 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) int v; BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); + + if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) + return; + sd->s_sibling = (void *)&wait; rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); @@ -354,7 +319,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) atomic_set(&sd->s_count, 1); atomic_set(&sd->s_active, 0); - sysfs_dirent_init_lockdep(sd); sd->s_name = name; sd->s_mode = mode; @@ -681,7 +645,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, } /* attach dentry and inode */ - inode = sysfs_get_inode(sd); + inode = sysfs_get_inode(dir->i_sb, sd); if (!inode) { ret = ERR_PTR(-ENOMEM); goto out_unlock; @@ -837,11 +801,46 @@ static inline unsigned char dt_type(struct sysfs_dirent *sd) return (sd->s_mode >> 12) & 15; } +static int sysfs_dir_release(struct inode *inode, struct file *filp) +{ + sysfs_put(filp->private_data); + return 0; +} + +static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd, + ino_t ino, struct sysfs_dirent *pos) +{ + if (pos) { + int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && + pos->s_parent == parent_sd && + ino == pos->s_ino; + sysfs_put(pos); + if (valid) + return pos; + } + pos = NULL; + if ((ino > 1) && (ino < INT_MAX)) { + pos = parent_sd->s_dir.children; + while (pos && (ino > pos->s_ino)) + pos = pos->s_sibling; + } + return pos; +} + +static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd, + ino_t ino, struct sysfs_dirent *pos) +{ + pos = sysfs_dir_pos(parent_sd, ino, pos); + if (pos) + pos = pos->s_sibling; + return pos; +} + static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dentry *dentry = filp->f_path.dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos; + struct sysfs_dirent *pos = filp->private_data; ino_t ino; if (filp->f_pos == 0) { @@ -857,29 +856,31 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; } - if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) { - mutex_lock(&sysfs_mutex); - - /* Skip the dentries we have already reported */ - pos = parent_sd->s_dir.children; - while (pos && (filp->f_pos > pos->s_ino)) - pos = pos->s_sibling; - - for ( ; pos; pos = pos->s_sibling) { - const char * name; - int len; - - name = pos->s_name; - len = strlen(name); - filp->f_pos = ino = pos->s_ino; + mutex_lock(&sysfs_mutex); + for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); + pos; + pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { + const char * name; + unsigned int type; + int len, ret; + + name = pos->s_name; + len = strlen(name); + ino = pos->s_ino; + type = dt_type(pos); + filp->f_pos = ino; + filp->private_data = sysfs_get(pos); - if (filldir(dirent, name, len, filp->f_pos, ino, - dt_type(pos)) < 0) - break; - } - if (!pos) - filp->f_pos = INT_MAX; mutex_unlock(&sysfs_mutex); + ret = filldir(dirent, name, len, filp->f_pos, ino, type); + mutex_lock(&sysfs_mutex); + if (ret < 0) + break; + } + mutex_unlock(&sysfs_mutex); + if ((filp->f_pos > 1) && !pos) { /* EOF */ + filp->f_pos = INT_MAX; + filp->private_data = NULL; } return 0; } @@ -888,5 +889,6 @@ 
static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, + .release = sysfs_dir_release, .llseek = generic_file_llseek, }; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index dc30d9e3168..e222b258274 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -53,7 +53,7 @@ struct sysfs_buffer { size_t count; loff_t pos; char * page; - struct sysfs_ops * ops; + const struct sysfs_ops * ops; struct mutex mutex; int needs_read_fill; int event; @@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops * ops = buffer->ops; int ret = 0; ssize_t count; @@ -85,13 +85,13 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer return -ENOMEM; /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -ENODEV; buffer->event = atomic_read(&attr_sd->s_attr.open->event); count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); /* * The code works fine with PAGE_SIZE return but it's likely to @@ -199,16 +199,16 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops * ops = buffer->ops; int rc; /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -ENODEV; rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return rc; } @@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; struct sysfs_buffer *buffer; - struct sysfs_ops *ops; + const struct sysfs_ops *ops; int error = -EACCES; char *p; @@ -344,7 +344,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) return -ENODEV; /* every kobject with an attribute needs a ktype assigned */ @@ -393,13 +393,13 @@ static int sysfs_open_file(struct inode *inode, struct file *file) goto err_free; /* open succeeded, put active references */ - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return 0; err_free: kfree(buffer); err_out: - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); return error; } @@ -437,12 +437,12 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) struct sysfs_open_dirent *od = attr_sd->s_attr.open; /* need parent for the kobj, grab both */ - if (!sysfs_get_active_two(attr_sd)) + if (!sysfs_get_active(attr_sd)) goto trigger; poll_wait(filp, &od->poll, wait); - sysfs_put_active_two(attr_sd); + sysfs_put_active(attr_sd); if (buffer->event != atomic_read(&od->event)) goto trigger; @@ -509,6 +509,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, if (!sd) return -ENOMEM; sd->s_attr.attr = (void *)attr; + 
sysfs_dirent_init_lockdep(sd); sysfs_addrm_start(&acxt, dir_sd); rc = sysfs_add_one(&acxt, sd); @@ -542,6 +543,18 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) } +int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) +{ + int err = 0; + int i; + + for (i = 0; ptr[i] && !err; i++) + err = sysfs_create_file(kobj, ptr[i]); + if (err) + while (--i >= 0) + sysfs_remove_file(kobj, ptr[i]); + return err; +} /** * sysfs_add_file_to_group - add an attribute file to a pre-existing group. @@ -614,6 +627,12 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) sysfs_hash_and_remove(kobj->sd, attr->name); } +void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) +{ + int i; + for (i = 0; ptr[i]; i++) + sysfs_remove_file(kobj, ptr[i]); +} /** * sysfs_remove_file_from_group - remove an attribute file from a group. @@ -732,3 +751,5 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback); EXPORT_SYMBOL_GPL(sysfs_create_file); EXPORT_SYMBOL_GPL(sysfs_remove_file); +EXPORT_SYMBOL_GPL(sysfs_remove_files); +EXPORT_SYMBOL_GPL(sysfs_create_files); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 6a06a1d1ea7..082daaecac1 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -111,20 +111,20 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) if (!sd) return -EINVAL; + mutex_lock(&sysfs_mutex); error = inode_change_ok(inode, iattr); if (error) - return error; + goto out; iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ error = inode_setattr(inode, iattr); if (error) - return error; + goto out; - mutex_lock(&sysfs_mutex); error = sysfs_sd_setattr(sd, iattr); +out: mutex_unlock(&sysfs_mutex); - return error; } @@ -283,6 +283,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) /** * sysfs_get_inode - get inode for sysfs_dirent + * @sb: super block * @sd: sysfs_dirent to allocate inode for * * Get inode for @sd. If such inode doesn't exist, a new inode @@ -295,11 +296,11 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) * RETURNS: * Pointer to allocated inode on success, NULL on failure. 
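A hedged usage sketch for the sysfs_create_files()/sysfs_remove_files() helpers added above; the attribute names are invented. The array must be NULL-terminated, and a failure part-way through creation rolls back the files already created:

	#include <linux/sysfs.h>

	static struct attribute example_a = { .name = "a", .mode = 0444 };
	static struct attribute example_b = { .name = "b", .mode = 0644 };

	static const struct attribute *example_attrs[] = {
		&example_a,
		&example_b,
		NULL,		/* terminator required */
	};

	/* creates "a" then "b"; if "b" fails, "a" is removed again */
	int example_add(struct kobject *kobj)
	{
		return sysfs_create_files(kobj, example_attrs);
	}

	void example_del(struct kobject *kobj)
	{
		sysfs_remove_files(kobj, example_attrs);
	}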
*/ -struct inode * sysfs_get_inode(struct sysfs_dirent *sd) +struct inode * sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) { struct inode *inode; - inode = iget_locked(sysfs_sb, sd->s_ino); + inode = iget_locked(sb, sd->s_ino); if (inode && (inode->i_state & I_NEW)) sysfs_init_inode(sd, inode); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 49749955cca..0cb10884a2f 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -23,7 +23,6 @@ static struct vfsmount *sysfs_mount; -struct super_block * sysfs_sb = NULL; struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { @@ -50,11 +49,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = SYSFS_MAGIC; sb->s_op = &sysfs_ops; sb->s_time_gran = 1; - sysfs_sb = sb; /* get root inode, initialize and unlock it */ mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(&sysfs_root); + inode = sysfs_get_inode(sb, &sysfs_root); mutex_unlock(&sysfs_mutex); if (!inode) { pr_debug("sysfs: could not get root inode\n"); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c5eff49fa41..1b9a3a1e8a1 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -123,6 +123,44 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) sysfs_hash_and_remove(parent_sd, name); } +/** + * sysfs_rename_link - rename symlink in object's directory. + * @kobj: object we're acting for. + * @targ: object we're pointing to. + * @old: previous name of the symlink. + * @new: new name of the symlink. + * + * A helper function for the common rename symlink idiom. + */ +int sysfs_rename_link(struct kobject *kobj, struct kobject *targ, + const char *old, const char *new) +{ + struct sysfs_dirent *parent_sd, *sd = NULL; + int result; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + result = -ENOENT; + sd = sysfs_get_dirent(parent_sd, old); + if (!sd) + goto out; + + result = -EINVAL; + if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + goto out; + if (sd->s_symlink.target_sd->s_dir.kobj != targ) + goto out; + + result = sysfs_rename(sd, parent_sd, new); + +out: + sysfs_put(sd); + return result; +} + static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, struct sysfs_dirent *target_sd, char *path) { diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index cdd9377a6e0..30f5a44fb5d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -66,8 +66,8 @@ struct sysfs_dirent { }; unsigned int s_flags; + unsigned short s_mode; ino_t s_ino; - umode_t s_mode; struct sysfs_inode_attrs *s_iattr; }; @@ -79,6 +79,7 @@ struct sysfs_dirent { #define SYSFS_KOBJ_BIN_ATTR 0x0004 #define SYSFS_KOBJ_LINK 0x0008 #define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) +#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) #define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK #define SYSFS_FLAG_REMOVED 0x0200 @@ -91,9 +92,12 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_dirent_init_lockdep(sd) \ do { \ - static struct lock_class_key __key; \ + struct attribute *attr = sd->s_attr.attr; \ + struct lock_class_key *key = attr->key; \ + if (!key) \ + key = &attr->skey; \ \ - lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \ + lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ } while(0) #else #define sysfs_dirent_init_lockdep(sd) do {} while(0) @@ -111,7 +115,6 @@ struct sysfs_addrm_cxt { * mount.c */ extern struct sysfs_dirent sysfs_root; -extern struct super_block *sysfs_sb; extern struct kmem_cache 
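For the new sysfs_rename_link() helper, a sketch of the intended caller-side idiom — the kobjects and link names here are hypothetical. The helper refuses the rename (-EINVAL) if the existing entry is not a symlink or no longer points at the expected target:

	/* Hypothetical: rename the link to @slave that lives in
	 * @master's directory, without re-creating it. */
	int example_rename_slave(struct kobject *master, struct kobject *slave)
	{
		return sysfs_rename_link(master, slave, "slave0", "slave1");
	}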
*sysfs_dir_cachep; /* @@ -124,8 +127,8 @@ extern const struct file_operations sysfs_dir_operations; extern const struct inode_operations sysfs_dir_inode_operations; struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd); -struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd); -void sysfs_put_active_two(struct sysfs_dirent *sd); +struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); +void sysfs_put_active(struct sysfs_dirent *sd); void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *parent_sd); int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd); @@ -168,7 +171,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) /* * inode.c */ -struct inode *sysfs_get_inode(struct sysfs_dirent *sd); +struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); int sysfs_permission(struct inode *inode, int mask); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 9824743832a..4573734d723 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/vfs.h> +#include <linux/writeback.h> #include <linux/namei.h> #include <asm/byteorder.h> #include "sysv.h" @@ -246,7 +247,7 @@ bad_inode: return ERR_PTR(-EIO); } -int sysv_write_inode(struct inode *inode, int wait) +static int __sysv_write_inode(struct inode *inode, int wait) { struct super_block * sb = inode->i_sb; struct sysv_sb_info * sbi = SYSV_SB(sb); @@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait) return 0; } +int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + int sysv_sync_inode(struct inode *inode) { - return sysv_write_inode(inode, 1); + return __sysv_write_inode(inode, 1); } static void sysv_delete_inode(struct inode *inode) diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 53786eb5cf6..94cb9b4d76c 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, /* inode.c */ extern struct inode *sysv_iget(struct super_block *, unsigned int); -extern int sysv_write_inode(struct inode *, int); +extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 552fb0111ff..401e503d44a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); return err; out_cancel: diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 16a6444330e..e26c02ab6cd 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) /* Is the page fully inside @i_size? 
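The sysv hunk above is one instance of a tree-wide conversion visible throughout the rest of this diff: ->write_inode() now takes a struct writeback_control instead of an int wait flag. A distilled sketch of the pattern, with __example_write_inode() as a hypothetical stand-in for each filesystem's old helper:

	#include <linux/writeback.h>

	static int example_write_inode(struct inode *inode,
				       struct writeback_control *wbc)
	{
		/* the old "wait" argument is now derived from the wbc */
		return __example_write_inode(inode,
				wbc->sync_mode == WB_SYNC_ALL);
	}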
*/ if (page->index < end_index) { if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; /* @@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc) kunmap_atomic(kaddr, KM_USER0); if (i_size > synced_i_size) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_unlock; } @@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, if (release) ubifs_release_budget(c, &req); if (IS_SYNC(inode)) - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); return err; out: @@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) * the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { - err = inode->i_sb->s_op->write_inode(inode, 1); + err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 43f9d19a6f3..4d2f2157dd3 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode) /* * Note, Linux write-back code calls this without 'i_mutex'. */ -static int ubifs_write_inode(struct inode *inode, int wait) +static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index b2d96f45c12..19626e2491c 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -31,55 +31,8 @@ #define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) #define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) #define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) -#define udf_find_first_one_bit(addr, size) find_first_one_bit(addr, size) #define udf_find_next_one_bit(addr, size, offset) \ - find_next_one_bit(addr, size, offset) - -#define leBPL_to_cpup(x) leNUM_to_cpup(BITS_PER_LONG, x) -#define leNUM_to_cpup(x, y) xleNUM_to_cpup(x, y) -#define xleNUM_to_cpup(x, y) (le ## x ## _to_cpup(y)) -#define uintBPL_t uint(BITS_PER_LONG) -#define uint(x) xuint(x) -#define xuint(x) __le ## x - -static inline int find_next_one_bit(void *addr, int size, int offset) -{ - uintBPL_t *p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG); - int result = offset & ~(BITS_PER_LONG - 1); - unsigned long tmp; - - if (offset >= size) - return size; - size -= result; - offset &= (BITS_PER_LONG - 1); - if (offset) { - tmp = leBPL_to_cpup(p++); - tmp &= ~0UL << offset; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG - 1)) { - tmp = leBPL_to_cpup(p++); - if (tmp) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = leBPL_to_cpup(p); -found_first: - tmp &= ~0UL >> (BITS_PER_LONG - size); -found_middle: - return result + ffz(~tmp); -} - -#define find_first_one_bit(addr, size)\ - find_next_one_bit((addr), (size), 0) + ext2_find_next_bit(addr, size, offset) static int read_block_bitmap(struct super_block *sb, struct udf_bitmap *bitmap, unsigned int block, @@ -208,7 +161,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb, ((char *)bh->b_data)[(bit + i) >> 3]); } else { if (inode) - vfs_dq_free_block(inode, 1); + 
dquot_free_block(inode, 1); udf_add_free_space(sb, sbi->s_partition, 1); } } @@ -260,11 +213,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb, while (bit < (sb->s_blocksize << 3) && block_count > 0) { if (!udf_test_bit(bit, bh->b_data)) goto out; - else if (vfs_dq_prealloc_block(inode, 1)) + else if (dquot_prealloc_block(inode, 1)) goto out; else if (!udf_clear_bit(bit, bh->b_data)) { udf_debug("bit already cleared for block %d\n", bit); - vfs_dq_free_block(inode, 1); + dquot_free_block(inode, 1); goto out; } block_count--; @@ -390,10 +343,14 @@ got_block: /* * Check quota for allocation of this block. */ - if (inode && vfs_dq_alloc_block(inode, 1)) { - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + int ret = dquot_alloc_block(inode, 1); + + if (ret) { + mutex_unlock(&sbi->s_alloc_mutex); + *err = ret; + return 0; + } } newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - @@ -449,7 +406,7 @@ static void udf_table_free_blocks(struct super_block *sb, /* We do this up front - There are some error conditions that could occure, but.. oh well */ if (inode) - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); udf_add_free_space(sb, sbi->s_partition, count); start = bloc->logicalBlockNum + offset; @@ -694,7 +651,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, epos.offset -= adsize; alloc_count = (elen >> sb->s_blocksize_bits); - if (inode && vfs_dq_prealloc_block(inode, + if (inode && dquot_prealloc_block(inode, alloc_count > block_count ? block_count : alloc_count)) alloc_count = 0; else if (alloc_count > block_count) { @@ -797,12 +754,13 @@ static int udf_table_new_block(struct super_block *sb, newblock = goal_eloc.logicalBlockNum; goal_eloc.logicalBlockNum++; goal_elen -= sb->s_blocksize; - - if (inode && vfs_dq_alloc_block(inode, 1)) { - brelse(goal_epos.bh); - mutex_unlock(&sbi->s_alloc_mutex); - *err = -EDQUOT; - return 0; + if (inode) { + *err = dquot_alloc_block(inode, 1); + if (*err) { + brelse(goal_epos.bh); + mutex_unlock(&sbi->s_alloc_mutex); + return 0; + } } if (goal_elen) diff --git a/fs/udf/file.c b/fs/udf/file.c index f311d509b6a..1eb06774ed9 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,6 +34,7 @@ #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/aio.h> @@ -207,7 +208,7 @@ const struct file_operations udf_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .ioctl = udf_ioctl, - .open = generic_file_open, + .open = dquot_file_open, .mmap = generic_file_mmap, .write = do_sync_write, .aio_write = udf_file_aio_write, @@ -217,6 +218,29 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; +static int udf_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if (iattr->ia_valid & ATTR_SIZE) + dquot_initialize(inode); + + if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || + (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, iattr); + if (error) + return error; + } + + return inode_setattr(inode, iattr); +} + const struct inode_operations udf_file_inode_operations = { - .truncate = udf_truncate, + .truncate = udf_truncate, + .setattr = udf_setattr, }; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index c10fa39f97e..fb68c9cd0c3 
100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode) * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode(inode); @@ -61,7 +61,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; - int block; + int block, ret; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); @@ -153,12 +153,14 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) insert_inode_hash(inode); mark_inode_dirty(inode); - if (vfs_dq_alloc_inode(inode)) { - vfs_dq_drop(inode); + dquot_initialize(inode); + ret = dquot_alloc_inode(inode); + if (ret) { + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); - *err = -EDQUOT; + *err = ret; return NULL; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 378a7592257..bb863fe579a 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -36,6 +36,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> @@ -70,6 +71,9 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); void udf_delete_inode(struct inode *inode) { + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -102,12 +106,14 @@ void udf_clear_inode(struct inode *inode) if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " - "inode size %llu different from extent lenght %llu. " + "inode size %llu different from extent length %llu. 
" "Filesystem need not be standards compliant.\n", inode->i_sb->s_id, inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } + + dquot_drop(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; } @@ -1373,12 +1379,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe) return mode; } -int udf_write_inode(struct inode *inode, int sync) +int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = udf_update_inode(inode, sync); + ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; @@ -1402,20 +1408,19 @@ static int udf_update_inode(struct inode *inode, int do_sync) unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); - bh = udf_tread(inode->i_sb, - udf_get_lb_pblock(inode->i_sb, - &iinfo->i_location, 0)); + bh = udf_tgetblk(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { - udf_debug("bread failure\n"); - return -EIO; + udf_debug("getblk failure\n"); + return -ENOMEM; } - memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); - + lock_buffer(bh); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; - if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { + if (iinfo->i_use) { struct unallocSpaceEntry *use = (struct unallocSpaceEntry *)bh->b_data; @@ -1423,20 +1428,18 @@ static int udf_update_inode(struct inode *inode, int do_sync) memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); + use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); + use->descTag.tagLocation = + cpu_to_le32(iinfo->i_location.logicalBlockNum); crclen = sizeof(struct unallocSpaceEntry) + iinfo->i_lenAlloc - sizeof(struct tag); - use->descTag.tagLocation = cpu_to_le32( - iinfo->i_location. 
- logicalBlockNum); use->descTag.descCRCLength = cpu_to_le16(crclen); use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + sizeof(struct tag), crclen)); use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); - mark_buffer_dirty(bh); - brelse(bh); - return err; + goto out; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) @@ -1591,18 +1594,21 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); fe->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); - crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - - sizeof(struct tag); + crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); +out: + set_buffer_uptodate(bh); + unlock_buffer(bh); + /* write the data blocks */ mark_buffer_dirty(bh); if (do_sync) { sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { + if (buffer_write_io_error(bh)) { printk(KERN_WARNING "IO error syncing udf inode " "[%s:%08lx]\n", inode->i_sb->s_id, inode->i_ino); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c56ff00cd5..db423ab078b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -563,6 +563,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, int err; struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); inode = udf_new_inode(dir, mode, &err); if (!inode) { @@ -616,6 +618,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, if (!old_valid_dev(rdev)) return -EINVAL; + dquot_initialize(dir); + lock_kernel(); err = -EIO; inode = udf_new_inode(dir, mode, &err); @@ -662,6 +666,8 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); err = -EMLINK; if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) @@ -799,6 +805,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; + dquot_initialize(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -845,6 +853,8 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) struct fileIdentDesc cfi; struct kernel_lb_addr tloc; + dquot_initialize(dir); + retval = -ENOENT; lock_kernel(); fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); @@ -899,6 +909,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct buffer_head *bh; struct udf_inode_info *iinfo; + dquot_initialize(dir); + lock_kernel(); inode = udf_new_inode(dir, S_IFLNK, &err); if (!inode) @@ -1069,6 +1081,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, int err; struct buffer_head *bh; + dquot_initialize(dir); + lock_kernel(); if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { unlock_kernel(); @@ -1131,6 +1145,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); + dquot_initialize(old_dir); + dquot_initialize(new_dir); + lock_kernel(); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (ofi) { diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 8d46f4294ee..4223ac855da 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ 
-142,7 +142,7 @@ extern void udf_truncate(struct inode *); extern void udf_read_inode(struct inode *); extern void udf_delete_inode(struct inode *); extern void udf_clear_inode(struct inode *); -extern int udf_write_inode(struct inode *, int); +extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int udf_extend_file(struct inode *, struct extent_position *, struct kernel_long_ad *, sector_t); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 54c16ec95df..5cfa4d85ccf 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) "bit already cleared for fragment %u", i); } - vfs_dq_free_block(inode, count); + dquot_free_block(inode, count); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); @@ -195,7 +195,7 @@ do_more: ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); - vfs_dq_free_block(inode, uspi->s_fpb); + dquot_free_block(inode, uspi->s_fpb); fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); uspi->cs_total.cs_nbfree++; @@ -511,6 +511,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; + int ret; UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", (unsigned long long)fragment, oldcount, newcount); @@ -556,8 +557,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); for (i = oldcount; i < newcount; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } @@ -596,6 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; u64 result; + int ret; UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); @@ -664,7 +667,7 @@ cg_found: for (i = count; i < uspi->s_fpb; i++) ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; - vfs_dq_free_block(inode, i); + dquot_free_block(inode, i); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; @@ -676,8 +679,9 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; - if (vfs_dq_alloc_block(inode, count)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, count); + if (ret) { + *err = ret; return 0; } for (i = 0; i < count; i++) @@ -714,6 +718,7 @@ static u64 ufs_alloccg_block(struct inode *inode, struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; + int ret; UFSD("ENTER, goal %llu\n", (unsigned long long)goal); @@ -747,8 +752,9 @@ gotit: ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, -1); - if (vfs_dq_alloc_block(inode, uspi->s_fpb)) { - *err = -EDQUOT; + ret = dquot_alloc_block(inode, uspi->s_fpb); + if (ret) { + *err = ret; return INVBLOCK; } diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 73655c61240..a8962cecde5 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -40,7 +41,7 @@ const 
struct file_operations ufs_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .open = generic_file_open, + .open = dquot_file_open, .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 3527c00fef0..230ecf60802 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode) is_directory = S_ISDIR(inode->i_mode); - vfs_dq_free_inode(inode); - vfs_dq_drop(inode); + dquot_free_inode(inode); + dquot_drop(inode); clear_inode (inode); @@ -355,9 +355,10 @@ cg_found: unlock_super (sb); - if (vfs_dq_alloc_inode(inode)) { - vfs_dq_drop(inode); - err = -EDQUOT; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) { + dquot_drop(inode); goto fail_without_unlock; } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7cf33379fd4..80b68c3702d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -36,6 +36,8 @@ #include <linux/mm.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -890,11 +892,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync) return 0; } -int ufs_write_inode (struct inode * inode, int wait) +int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; lock_kernel(); - ret = ufs_update_inode (inode, wait); + ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); unlock_kernel(); return ret; } @@ -908,6 +910,9 @@ void ufs_delete_inode (struct inode * inode) { loff_t old_i_size; + if (!is_bad_inode(inode)) + dquot_initialize(inode); + truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 4c26d9e8bc9..118556243e7 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -30,6 +30,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/smp_lock.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -84,6 +85,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, int err; UFSD("BEGIN\n"); + + dquot_initialize(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); @@ -107,6 +111,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t if (!old_valid_dev(rdev)) return -EINVAL; + + dquot_initialize(dir); + inode = ufs_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -131,6 +138,8 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; + dquot_initialize(dir); + lock_kernel(); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); @@ -176,6 +185,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return -EMLINK; } + dquot_initialize(dir); + inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); atomic_inc(&inode->i_count); @@ -193,6 +204,8 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode) if (dir->i_nlink >= UFS_LINK_MAX) goto out; + dquot_initialize(dir); + lock_kernel(); inode_inc_link_count(dir); @@ -237,6 +250,8 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; + dquot_initialize(dir); + de = ufs_find_entry(dir, &dentry->d_name, &page); if (!de) goto out; @@ -281,6 +296,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = 
-ENOENT; + dquot_initialize(old_dir); + dquot_initialize(new_dir); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 143c20bfb04..14743d935a9 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1016,6 +1016,9 @@ magic_found: case UFS_FSSTABLE: UFSD("fs is stable\n"); break; + case UFS_FSLOG: + UFSD("fs is logging fs\n"); + break; case UFS_FSOSF1: UFSD("fs is DEC OSF/1\n"); break; @@ -1432,6 +1435,11 @@ static void destroy_inodecache(void) kmem_cache_destroy(ufs_inode_cachep); } +static void ufs_clear_inode(struct inode *inode) +{ + dquot_drop(inode); +} + #ifdef CONFIG_QUOTA static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t); static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t); @@ -1442,6 +1450,7 @@ static const struct super_operations ufs_super_ops = { .destroy_inode = ufs_destroy_inode, .write_inode = ufs_write_inode, .delete_inode = ufs_delete_inode, + .clear_inode = ufs_clear_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, .sync_fs = ufs_sync_fs, diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 41dd431ce22..d3b6270cb37 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -44,6 +44,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/sched.h> +#include <linux/quotaops.h> #include "ufs_fs.h" #include "ufs.h" @@ -517,9 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = dquot_transfer(inode, attr); + if (error) + return error; + } if (ia_valid & ATTR_SIZE && attr->ia_size != i_size_read(inode)) { loff_t old_i_size = inode->i_size; + + dquot_initialize(inode); + error = vmtruncate(inode, attr->ia_size); if (error) return error; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 01d0e2a3b23..43f9f5d5670 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int); /* inode.c */ extern struct inode *ufs_iget(struct super_block *, unsigned long); -extern int ufs_write_inode (struct inode *, int); +extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_delete_inode (struct inode *); extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *); diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 54bde1895a8..6943ec677c0 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -138,6 +138,7 @@ typedef __u16 __bitwise __fs16; #define UFS_USEEFT ((__u16)65535) +/* fs_clean values */ #define UFS_FSOK 0x7c269d38 #define UFS_FSACTIVE ((__s8)0x00) #define UFS_FSCLEAN ((__s8)0x01) @@ -145,6 +146,11 @@ typedef __u16 __bitwise __fs16; #define UFS_FSOSF1 ((__s8)0x03) /* is this correct for DEC OSF/1? 
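The ufs hunks repeat the shape of the udf ones: the old vfs_dq_*() calls returned a boolean and callers substituted -EDQUOT, while the dquot_*() replacements return a real errno that is propagated. A reduced sketch, with the surrounding allocation logic invented:

	#include <linux/quotaops.h>

	/* Charge quota for one block; the caller now sees the real
	 * errno rather than an assumed -EDQUOT. */
	static int example_charge_block(struct inode *inode, int *err)
	{
		int ret = dquot_alloc_block(inode, 1);

		if (ret) {
			*err = ret;	/* -EDQUOT, -EIO, ... */
			return 0;	/* hypothetical: nothing allocated */
		}
		return 1;		/* hypothetical: one block granted */
	}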
*/ #define UFS_FSBAD ((__s8)0xff) +/* Solaris-specific fs_clean values */ +#define UFS_FSSUSPEND ((__s8)0xfe) /* temporarily suspended */ +#define UFS_FSLOG ((__s8)0xfd) /* logging fs */ +#define UFS_FSFIX ((__s8)0xfc) /* being repaired while mounted */ + /* From here to next blank line, s_flags for ufs_sb_info */ /* directory entry encoding */ #define UFS_DE_MASK 0x00000010 /* mask for the following */ @@ -227,11 +233,16 @@ typedef __u16 __bitwise __fs16; */ #define ufs_cbtocylno(bno) \ ((bno) * uspi->s_nspf / uspi->s_spc) -#define ufs_cbtorpos(bno) \ +#define ufs_cbtorpos(bno) \ + ((UFS_SB(sb)->s_flags & UFS_CG_SUN) ? \ + (((((bno) * uspi->s_nspf % uspi->s_spc) % \ + uspi->s_nsect) * \ + uspi->s_nrpos) / uspi->s_nsect) \ + : \ ((((bno) * uspi->s_nspf % uspi->s_spc / uspi->s_nsect \ * uspi->s_trackskew + (bno) * uspi->s_nspf % uspi->s_spc \ % uspi->s_nsect * uspi->s_interleave) % uspi->s_nsect \ - * uspi->s_nrpos) / uspi->s_npsect) + * uspi->s_nrpos) / uspi->s_npsect)) /* * The following macros optimize certain frequently calculated diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 5c5a366aa33..b4769e40e8b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -105,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ xfs_globals.o \ xfs_ioctl.o \ xfs_iops.o \ - xfs_lrw.o \ xfs_super.o \ xfs_sync.o \ xfs_xattr.o) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 66abe36c121..99628508cb1 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -39,6 +39,7 @@ #include "xfs_iomap.h" #include "xfs_vnodeops.h" #include "xfs_trace.h" +#include "xfs_bmap.h" #include <linux/mpage.h> #include <linux/pagevec.h> #include <linux/writeback.h> @@ -163,14 +164,17 @@ xfs_ioend_new_eof( } /* - * Update on-disk file size now that data has been written to disk. - * The current in-memory file size is i_size. If a write is beyond - * eof i_new_size will be the intended file size until i_size is - * updated. If this write does not extend all the way to the valid - * file size then restrict this update to the end of the write. + * Update on-disk file size now that data has been written to disk. The + * current in-memory file size is i_size. If a write is beyond eof i_new_size + * will be the intended file size until i_size is updated. If this write does + * not extend all the way to the valid file size then restrict this update to + * the end of the write. + * + * This function does not block as blocking on the inode lock in IO completion + * can lead to IO completion order dependency deadlocks.. If it can't get the + * inode ilock it will return EAGAIN. Callers must handle this. */ - -STATIC void +STATIC int xfs_setfilesize( xfs_ioend_t *ioend) { @@ -181,16 +185,40 @@ xfs_setfilesize( ASSERT(ioend->io_type != IOMAP_READ); if (unlikely(ioend->io_error)) - return; + return 0; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) + return EAGAIN; - xfs_ilock(ip, XFS_ILOCK_EXCL); isize = xfs_ioend_new_eof(ioend); if (isize) { ip->i_d.di_size = isize; - xfs_mark_inode_dirty_sync(ip); + xfs_mark_inode_dirty(ip); } xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. + */ +STATIC void +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq; + + wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 
+ xfsconvertd_workqueue : xfsdatad_workqueue; + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } } /* @@ -198,11 +226,11 @@ xfs_setfilesize( */ STATIC void xfs_end_io( - struct work_struct *work) + struct work_struct *work) { - xfs_ioend_t *ioend = - container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; /* * For unwritten extents we need to issue transactions to convert a @@ -210,7 +238,6 @@ xfs_end_io( */ if (ioend->io_type == IOMAP_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { - int error; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); @@ -222,30 +249,23 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) - xfs_setfilesize(ioend); - xfs_destroy_ioend(ioend); -} - -/* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct workqueue_struct *wq; - - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? - xfsconvertd_workqueue : xfsdatad_workqueue; - queue_work(wq, &ioend->io_work); - if (wait) - flush_workqueue(wq); + if (ioend->io_type != IOMAP_READ) { + error = xfs_setfilesize(ioend); + ASSERT(!error || error == EAGAIN); } + + /* + * If we didn't complete processing of the ioend, requeue it to the + * tail of the workqueue for another attempt later. Otherwise destroy + * it. + */ + if (error == EAGAIN) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend, 0); + /* ensure we don't spin on blocked ioends */ + delay(1); + } else + xfs_destroy_ioend(ioend); } /* @@ -341,7 +361,7 @@ xfs_submit_ioend_bio( * but don't update the inode size until I/O completion. */ if (xfs_ioend_new_eof(ioend)) - xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE, bio); @@ -874,6 +894,125 @@ xfs_cluster_write( } } +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned long offset) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset); + block_invalidatepage(page, offset); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. 
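The xfs_setfilesize()/xfs_end_io() rework above replaces a blocking ilock in the completion path with a trylock-or-defer scheme. Reduced to its shape as a sketch — every example_*() name is a hypothetical stand-in; only the positive-EAGAIN convention and the io_remaining requeue come from the patch:

	/* On EAGAIN, pin the ioend again and requeue it instead of
	 * blocking inside I/O completion. */
	static void example_end_io(struct example_ioend *ioend)
	{
		int error = example_try_setfilesize(ioend);

		if (error == EAGAIN) {
			atomic_inc(&ioend->io_remaining);
			example_requeue(ioend);		/* retry later */
		} else {
			example_destroy(ioend);
		}
	}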
+ */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + ssize_t len = 1 << inode->i_blkbits; + + if (!xfs_is_delayed_page(page, IOMAP_DELAY)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int done; + xfs_fileoff_t offset_fsb; + xfs_bmbt_irec_t imap; + int nimaps = 1; + int error; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + if (!buffer_delay(bh)) + goto next_buffer; + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, offset_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard failed delalloc mapping lookup."); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_buffer; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_buffer; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we + * don't need to cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, + &flist, NULL, &done); + + ASSERT(!flist.xbf_count && !flist.xbf_first); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += len; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0); + return; +} + /* * Calling this without startio set means we are being asked to make a dirty * page ready for freeing its buffers.
When called with startio set then @@ -1125,7 +1264,7 @@ error: */ if (err != -EAGAIN) { if (!unmapped) - block_invalidatepage(page, 0); + xfs_aops_discard_page(page); ClearPageUptodate(page); } return err; @@ -1535,15 +1674,6 @@ xfs_vm_readpages( return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned long offset) -{ - trace_xfs_invalidatepage(page->mapping->host, page, offset); - block_invalidatepage(page, offset); -} - const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 6f76ba85f19..bd111b7e1da 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -168,75 +168,6 @@ test_page_region( } /* - * Mapping of multi-page buffers into contiguous virtual space - */ - -typedef struct a_list { - void *vm_addr; - struct a_list *next; -} a_list_t; - -static a_list_t *as_free_head; -static int as_list_len; -static DEFINE_SPINLOCK(as_lock); - -/* - * Try to batch vunmaps because they are costly. - */ -STATIC void -free_address( - void *addr) -{ - a_list_t *aentry; - -#ifdef CONFIG_XEN - /* - * Xen needs to be able to make sure it can get an exclusive - * RO mapping of pages it wants to turn into a pagetable. If - * a newly allocated page is also still being vmap()ed by xfs, - * it will cause pagetable construction to fail. This is a - * quick workaround to always eagerly unmap pages so that Xen - * is happy. - */ - vunmap(addr); - return; -#endif - - aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); - if (likely(aentry)) { - spin_lock(&as_lock); - aentry->next = as_free_head; - aentry->vm_addr = addr; - as_free_head = aentry; - as_list_len++; - spin_unlock(&as_lock); - } else { - vunmap(addr); - } -} - -STATIC void -purge_addresses(void) -{ - a_list_t *aentry, *old; - - if (as_free_head == NULL) - return; - - spin_lock(&as_lock); - aentry = as_free_head; - as_free_head = NULL; - as_list_len = 0; - spin_unlock(&as_lock); - - while ((old = aentry) != NULL) { - vunmap(aentry->vm_addr); - aentry = aentry->next; - kfree(old); - } -} - -/* * Internal xfs_buf_t object manipulation */ @@ -337,7 +268,8 @@ xfs_buf_free( uint i; if (xfs_buf_is_vmapped(bp)) - free_address(bp->b_addr - bp->b_offset); + vm_unmap_ram(bp->b_addr - bp->b_offset, + bp->b_page_count); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -457,10 +389,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - if (as_list_len > 64) - purge_addresses(); - bp->b_addr = vmap(bp->b_pages, bp->b_page_count, - VM_MAP, PAGE_KERNEL); + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) return -ENOMEM; bp->b_addr += bp->b_offset; @@ -1955,9 +1885,6 @@ xfsbufd( xfs_buf_iostrategy(bp); count++; } - - if (as_list_len > 0) - purge_addresses(); if (count) blk_run_address_space(target->bt_mapping); diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 87b8cbd23d4..846b75aeb2a 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -29,6 +29,7 @@ #include "xfs_vnodeops.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -215,9 +216,28 @@ xfs_fs_get_parent( return 
d_obtain_alias(VFS_I(cip)); } +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, NULL); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + return error; +} + const struct export_operations xfs_export_operations = { .encode_fh = xfs_fs_encode_fh, .fh_to_dentry = xfs_fs_fh_to_dentry, .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, }; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index e4caeb28ce2..42dd3bcfba6 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -16,6 +16,7 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_fs.h" #include "xfs_bit.h" #include "xfs_log.h" #include "xfs_inum.h" @@ -34,52 +35,279 @@ #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_rw.h" #include "xfs_vnodeops.h" #include "xfs_da_btree.h" #include "xfs_ioctl.h" +#include "xfs_trace.h" #include <linux/dcache.h> static const struct vm_operations_struct xfs_file_vm_ops; -STATIC ssize_t -xfs_file_aio_read( - struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct xfs_inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count) /* size of data to zero */ { - struct file *file = iocb->ki_filp; - int ioflags = 0; + struct page *page; + struct address_space *mapping; + int status; - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, - nr_segs, &iocb->ki_pos, ioflags); + mapping = VFS_I(ip)->i_mapping; + do { + unsigned offset, bytes; + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; + + zero_user(page, offset, bytes); + + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; + } while (count); + + return (-status); +} + +STATIC int +xfs_file_fsync( + struct file *file, + struct dentry *dentry, + int datasync) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_trans *tp; + int error = 0; + int log_flushed = 0; + + xfs_itrace_entry(ip); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -XFS_ERROR(EIO); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + /* + * We always need to make sure that the required inode state is safe on + * disk. 
The inode might be clean but we still might need to force the + * log because of committed transactions that haven't hit the disk yet. + * Likewise, there could be unflushed non-transactional changes to the + * inode core that have to go to disk and this requires us to issue + * a synchronous transaction to capture these changes correctly. + * + * This code relies on the assumption that if the i_update_core field + * of the inode is clear and the inode is unpinned then it is clean + * and no action is required. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + + /* + * First check if the VFS inode is marked dirty. All the dirtying + * of non-transactional updates now goes through mark_inode_dirty*, + * which allows us to distinguish between pure timestamp updates + * and i_size updates which need to be caught for fdatasync. + * After that also check for the dirty state in the XFS inode, which + * might get cleared when the inode gets written out via the AIL + * or xfs_iflush_cluster. + */ + if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || + ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && + ip->i_update_core) { + /* + * Kick off a transaction to log the inode core to get the + * updates. The sync transaction will also force the log. + */ + xfs_iunlock(ip, XFS_ILOCK_SHARED); + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, + XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Note - it's possible that we might have pushed ourselves out + * of the way during trans_reserve which would flush the inode. + * But there's no guarantee that the inode buffer has actually + * gone out yet (it's delwri). Plus the buffer could be pinned + * anyway if it's part of an inode in another recent + * transaction. So we play it safe and fire off the + * transaction anyway. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = _xfs_trans_commit(tp, 0, &log_flushed); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else { + /* + * Timestamps/size haven't changed since last inode flush or + * inode transaction commit. That means either nothing got + * written or a transaction committed which caught the updates. + * If the latter happened and the transaction hasn't hit the + * disk yet, the inode will still be pinned. If it is, + * force the log. + */ + if (xfs_ipincount(ip)) { + error = _xfs_log_force_lsn(ip->i_mount, + ip->i_itemp->ili_last_lsn, + XFS_LOG_SYNC, &log_flushed); + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + } + + if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { + /* + * If the log write didn't issue an ordered tag we need + * to flush the disk cache for the data device now. + */ + if (!log_flushed) + xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); + + /* + * If this inode is on the RT dev we need to flush that + * cache as well.
+ */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); + } + + return -error; } STATIC ssize_t -xfs_file_aio_write( +xfs_file_aio_read( struct kiocb *iocb, - const struct iovec *iov, + const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + size_t size = 0; + ssize_t ret = 0; int ioflags = 0; + xfs_fsize_t n; + unsigned long seg; + + XFS_STATS_INC(xs_read_calls); BUG_ON(iocb->ki_pos != pos); + if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, - &iocb->ki_pos, ioflags); + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((iocb->ki_pos & target->bt_smask) || + (size & target->bt_smask)) { + if (iocb->ki_pos == ip->i_size) + return 0; + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - iocb->ki_pos; + if (n <= 0 || size == 0) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { + int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); + int iolock = XFS_IOLOCK_SHARED; + + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size, + dmflags, &iolock); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (unlikely(ioflags & IO_ISDIRECT)) + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + if (unlikely(ioflags & IO_ISDIRECT)) { + if (inode->i_mapping->nrpages) { + ret = -xfs_flushinval_pages(ip, + (iocb->ki_pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + } + mutex_unlock(&inode->i_mutex); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; + } + } + + trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); + + ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; } STATIC ssize_t @@ -87,16 +315,44 @@ xfs_file_splice_read( struct file *infilp, loff_t *ppos, struct pipe_inode_info *pipe, - size_t len, + size_t count, unsigned int flags) { + struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); if (infilp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), - infilp, ppos, pipe, len, flags, ioflags); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_SHARED; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, + FILP_DELAY_FLAG(infilp), &iolock); + if (error) { + xfs_iunlock(ip, 
XFS_IOLOCK_SHARED); + return -error; + } + } + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; } STATIC ssize_t @@ -104,16 +360,538 @@ xfs_file_splice_write( struct pipe_inode_info *pipe, struct file *outfilp, loff_t *ppos, - size_t len, + size_t count, unsigned int flags) { + struct inode *inode = outfilp->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t isize, new_size; int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_write_calls); if (outfilp->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), - pipe, outfilp, ppos, len, flags, ioflags); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { + int iolock = XFS_IOLOCK_EXCL; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, + FILP_DELAY_FLAG(outfilp), &iolock); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return -error; + } + } + + new_size = *ppos + count; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (new_size > ip->i_size) + ip->i_new_size = new_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + trace_xfs_file_splice_write(ip, count, *ppos, ioflags); + + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); + + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) + *ppos = isize; + + if (*ppos > ip->i_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (*ppos > ip->i_size) + ip->i_size = *ppos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + xfs_inode_t *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp = ip->i_mount; + int nimaps; + int zero_offset; + int zero_len; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + zero_offset = XFS_B_FSB_OFFSET(mp, isize); + if (zero_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. 
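The splice-write path above also shows the write-size protocol used throughout this patch: publish the prospective EOF in i_new_size before calling into the generic code (so concurrent operations see the pending extension), then fold the result back into i_size with a double-checked update. Restated as a hypothetical helper:

/* hypothetical helper; the real code open-codes this after each write */
static void example_update_isize(struct xfs_inode *ip, loff_t pos)
{
        if (pos > ip->i_size) {
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                /* recheck under ILOCK: a concurrent writer may have won */
                if (pos > ip->i_size)
                        ip->i_size = pos;
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
}

The unlocked first test keeps the common no-extension case free of ILOCK traffic; the second test under the lock makes the update race-safe.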
+ */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; + error = xfs_iozero(ip, isize, zero_len); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. + */ + +int /* error (positive) */ +xfs_zero_eof( + xfs_inode_t *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + ASSERT(offset > isize); + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, offset, isize); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL, NULL); + if (error) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks we need to zero. + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. 
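xfs_zero_eof() above mixes the round-up conversion XFS_B_TO_FSB() with the truncating XFS_B_TO_FSBT(), which is easy to misread. A worked example, assuming 4096-byte blocks (sb_blocklog == 12):

/*
 * Extending a file from isize = 10000 to offset = 20000:
 *
 *   last_fsb       = XFS_B_TO_FSBT(mp, 10000 - 1) = 9999 >> 12  = 2
 *   start_zero_fsb = XFS_B_TO_FSB(mp, 10000)      (rounds up)   = 3
 *   end_zero_fsb   = XFS_B_TO_FSBT(mp, 20000 - 1) = 19999 >> 12 = 4
 *
 * Block 2 holds the old EOF and is zeroed byte-wise by
 * xfs_zero_last_block(); blocks 3..4 go through the main loop, where
 * the "if (zero_off + zero_len > offset)" clamp that follows trims the
 * final block so zeroing stops exactly at the new offset.
 */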
+ */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); + if (error) { + goto out_lock; + } + + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + } + + return 0; + +out_lock: + xfs_ilock(ip, XFS_ILOCK_EXCL); + ASSERT(error >= 0); + return error; +} + +STATIC ssize_t +xfs_file_aio_write( + struct kiocb *iocb, + const struct iovec *iovp, + unsigned long nr_segs, + loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0, error = 0; + int ioflags = 0; + xfs_fsize_t isize, new_size; + int iolock; + int eventsent = 0; + size_t ocount = 0, count; + int need_i_mutex; + + XFS_STATS_INC(xs_write_calls); + + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); + if (error) + return error; + + count = ocount; + if (count == 0) + return 0; + + xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + +relock: + if (ioflags & IO_ISDIRECT) { + iolock = XFS_IOLOCK_SHARED; + need_i_mutex = 0; + } else { + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + } + + xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); + +start: + error = -generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_mutex; + } + + if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && + !(ioflags & IO_INVIS) && !eventsent)) { + int dmflags = FILP_DELAY_FLAG(file); + + if (need_i_mutex) + dmflags |= DM_FLAGS_IMUX; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip, + pos, count, dmflags, &iolock); + if (error) { + goto out_unlock_internal; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + eventsent = 1; + + /* + * The iolock was dropped and reacquired in XFS_SEND_DATA + * so we have to recheck the size when appending. + * We will only "goto start;" once, since having sent the + * event prevents another call to XFS_SEND_DATA, which is + * what allows the size to change in the first place. + */ + if ((file->f_flags & O_APPEND) && pos != ip->i_size) + goto start; + } + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->bt_smask) || (count & target->bt_smask)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + return XFS_ERROR(-EINVAL); + } + + if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { + xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); + iolock = XFS_IOLOCK_EXCL; + need_i_mutex = 1; + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); + goto start; + } + } + + new_size = pos + count; + if (new_size > ip->i_size) + ip->i_new_size = new_size; + + if (likely(!(ioflags & IO_INVIS))) + file_update_time(file); + + /* + * If the offset is beyond the size of the file, we have a couple + * of things to do. First, if there is already space allocated + * we need to either create holes or zero the disk or ... 
+ * + * If there is a page where the previous size lands, we need + * to zero it out up to the new size. + */ + + if (pos > ip->i_size) { + error = xfs_zero_eof(ip, pos, ip->i_size); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_unlock_internal; + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the + * setuid and setgid bits if the process is not being run + * by root. This keeps people from modifying setuid and + * setgid binaries. + */ + error = -file_remove_suid(file); + if (unlikely(error)) + goto out_unlock_internal; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + if ((ioflags & IO_ISDIRECT)) { + if (mapping->nrpages) { + WARN_ON(need_i_mutex == 0); + error = xfs_flushinval_pages(ip, + (pos & PAGE_CACHE_MASK), + -1, FI_REMAPF_LOCKED); + if (error) + goto out_unlock_internal; + } + + if (need_i_mutex) { + /* demote the lock now the cached pages are gone */ + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + mutex_unlock(&inode->i_mutex); + + iolock = XFS_IOLOCK_SHARED; + need_i_mutex = 0; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); + ret = generic_file_direct_write(iocb, iovp, + &nr_segs, pos, &iocb->ki_pos, count, ocount); + + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + if (ret >= 0 && ret != count) { + XFS_STATS_ADD(xs_write_bytes, ret); + + pos += ret; + count -= ret; + + ioflags &= ~IO_ISDIRECT; + xfs_iunlock(ip, iolock); + goto relock; + } + } else { + int enospc = 0; + ssize_t ret2 = 0; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); + ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, + pos, &iocb->ki_pos, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we + * aren't holding any page locks and retry *once* + */ + if (ret2 == -ENOSPC && !enospc) { + error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); + if (error) + goto out_unlock_internal; + enospc = 1; + goto write_retry; + } + ret = ret2; + } + + current->backing_dev_info = NULL; + + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) + iocb->ki_pos = isize; + + if (iocb->ki_pos > ip->i_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (iocb->ki_pos > ip->i_size) + ip->i_size = iocb->ki_pos; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (ret == -ENOSPC && + DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { + xfs_iunlock(ip, iolock); + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip, + DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL, + 0, 0, 0); /* Delay flag intentionally unused */ + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, iolock); + if (error) + goto out_unlock_internal; + goto start; + } + + error = -ret; + if (ret <= 0) + goto out_unlock_internal; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; + int error2; + + xfs_iunlock(ip, iolock); + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + + error2 = filemap_write_and_wait_range(mapping, pos, end); + if (!error) + error = error2; + if (need_i_mutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, iolock); + + error2 = -xfs_file_fsync(file, file->f_path.dentry, + (file->f_flags & __O_SYNC) ? 
0 : 1); + if (!error) + error = error2; + } + + out_unlock_internal: + if (ip->i_new_size) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + ip->i_new_size = 0; + /* + * If this was a direct or synchronous I/O that failed (such + * as ENOSPC) then part of the I/O may have been written to + * disk before the error occurred. In this case the on-disk + * file size may have been adjusted beyond the in-memory file + * size and now needs to be truncated back. + */ + if (ip->i_d.di_size > ip->i_size) + ip->i_d.di_size = ip->i_size; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + xfs_iunlock(ip, iolock); + out_unlock_mutex: + if (need_i_mutex) + mutex_unlock(&inode->i_mutex); + return -error; } STATIC int @@ -160,28 +938,6 @@ xfs_file_release( return -xfs_release(XFS_I(inode)); } -/* - * We ignore the datasync flag here because a datasync is effectively - * identical to an fsync. That is, datasync implies that we need to write - * only the metadata needed to be able to access the data that is written - * if we crash after the call completes. Hence if we are writing beyond - * EOF we have to log the inode size change as well, which makes it a - * full fsync. If we don't write beyond EOF, the inode core will be - * clean in memory and so we don't need to log the inode, just like - * fsync. - */ -STATIC int -xfs_file_fsync( - struct file *file, - struct dentry *dentry, - int datasync) -{ - struct xfs_inode *ip = XFS_I(dentry->d_inode); - - xfs_iflags_clear(ip, XFS_ITRUNCATED); - return -xfs_fsync(ip); -} - STATIC int xfs_file_readdir( struct file *filp, @@ -203,9 +959,9 @@ xfs_file_readdir( * * Try to give it an estimate that's good enough, maybe at some * point we can change the ->readdir prototype to include the - * buffer size. + * buffer size. For now we use the current glibc buffer size. */ - bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); + bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); error = xfs_readdir(ip, dirent, bufsize, (xfs_off_t *)&filp->f_pos, filldir); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index e8566bbf0f0..61a99608731 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -91,6 +91,16 @@ xfs_mark_inode_dirty_sync( mark_inode_dirty_sync(inode); } +void +xfs_mark_inode_dirty( + xfs_inode_t *ip) +{ + struct inode *inode = VFS_I(ip); + + if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR))) + mark_inode_dirty(inode); +} + /* * Change the requested timestamp in the given inode. * We don't lock across timestamp updates, and we don't log them but diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 5af0c81ca1a..facfb323a70 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -88,7 +88,6 @@ #include <xfs_super.h> #include <xfs_globals.h> #include <xfs_fs_subr.h> -#include <xfs_lrw.h> #include <xfs_buf.h> /* diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c deleted file mode 100644 index eac6f80d786..00000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ /dev/null @@ -1,796 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_attr.h" -#include "xfs_inode_item.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_iomap.h" -#include "xfs_vnodeops.h" -#include "xfs_trace.h" - -#include <linux/capability.h> -#include <linux/writeback.h> - - -/* - * xfs_iozero - * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. - */ -STATIC int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ -{ - struct page *page; - struct address_space *mapping; - int status; - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - pos += bytes; - count -= bytes; - status = 0; - } while (count); - - return (-status); -} - -ssize_t /* bytes read, or (-) error */ -xfs_read( - xfs_inode_t *ip, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int segs, - loff_t *offset, - int ioflags) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - xfs_mount_t *mp = ip->i_mount; - size_t size = 0; - ssize_t ret = 0; - xfs_fsize_t n; - unsigned long seg; - - - XFS_STATS_INC(xs_read_calls); - - /* START copy & waste from filemap.c */ - for (seg = 0; seg < segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - size += iv->iov_len; - if (unlikely((ssize_t)(size|iv->iov_len) < 0)) - return XFS_ERROR(-EINVAL); - } - /* END copy & waste from filemap.c */ - - if (unlikely(ioflags & IO_ISDIRECT)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; - if ((*offset & target->bt_smask) || - (size & target->bt_smask)) { - if (*offset == ip->i_size) { - return (0); - } - return -XFS_ERROR(EINVAL); - } - } - - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (size == 0)) - return 0; - - if (n < size) - size = n; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); - int iolock = XFS_IOLOCK_SHARED; - - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, - dmflags, &iolock); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); - return ret; - } - } - - if (unlikely(ioflags & IO_ISDIRECT)) { - if (inode->i_mapping->nrpages) - ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - mutex_unlock(&inode->i_mutex); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; - } - } - - trace_xfs_file_read(ip, size, *offset, ioflags); - - iocb->ki_pos = *offset; - ret = generic_file_aio_read(iocb, iovp, segs, *offset); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_read( - xfs_inode_t *ip, - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, - int flags, - int ioflags) -{ - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - - XFS_STATS_INC(xs_read_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_SHARED; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, - FILP_DELAY_FLAG(infilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_write( - xfs_inode_t *ip, - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - int flags, - int ioflags) -{ - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - struct inode *inode = outfilp->f_mapping->host; - xfs_fsize_t isize, new_size; - - XFS_STATS_INC(xs_write_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { - int iolock = XFS_IOLOCK_EXCL; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, - FILP_DELAY_FLAG(outfilp), &iolock); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return -error; - } - } - - new_size = *ppos + count; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (new_size > ip->i_size) - ip->i_new_size = new_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_size) - ip->i_size = *ppos; - 
xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if (ip->i_new_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - ip->i_new_size = 0; - if (ip->i_d.di_size > ip->i_size) - ip->i_d.di_size = ip->i_size; - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - -/* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - xfs_inode_t *ip, - xfs_fsize_t offset, - xfs_fsize_t isize) -{ - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = ip->i_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } - - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; - error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL, NULL); - if (error) { - return error; - } - ASSERT(nimaps > 0); - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) { - return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - error = xfs_iozero(ip, isize, zero_len); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; -} - -/* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. - */ - -int /* error (positive) */ -xfs_zero_eof( - xfs_inode_t *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ -{ - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - ASSERT(offset > isize); - - /* - * First handle zeroing the block on which isize resides. - * We only zero a part of that block so it is handled specially. - */ - error = xfs_zero_last_block(ip, offset, isize); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; - } - - /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. - */ - last_fsb = isize ? 
XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL, NULL); - if (error) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); - return error; - } - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) { - goto out_lock; - } - - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - } - - return 0; - -out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL); - ASSERT(error >= 0); - return error; -} - -ssize_t /* bytes written, or (-) error */ -xfs_write( - struct xfs_inode *xip, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int nsegs, - loff_t *offset, - int ioflags) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long segs = nsegs; - xfs_mount_t *mp; - ssize_t ret = 0, error = 0; - xfs_fsize_t isize, new_size; - int iolock; - int eventsent = 0; - size_t ocount = 0, count; - loff_t pos; - int need_i_mutex; - - XFS_STATS_INC(xs_write_calls); - - error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); - if (error) - return error; - - count = ocount; - pos = *offset; - - if (count == 0) - return 0; - - mp = xip->i_mount; - - xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - -relock: - if (ioflags & IO_ISDIRECT) { - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } else { - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - } - - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; - } - - if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && - !(ioflags & IO_INVIS) && !eventsent)) { - int dmflags = FILP_DELAY_FLAG(file); - - if (need_i_mutex) - dmflags |= DM_FLAGS_IMUX; - - xfs_iunlock(xip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, - pos, count, dmflags, &iolock); - if (error) { - goto out_unlock_internal; - } - xfs_ilock(xip, 
XFS_ILOCK_EXCL); - eventsent = 1; - - /* - * The iolock was dropped and reacquired in XFS_SEND_DATA - * so we have to recheck the size when appending. - * We will only "goto start;" once, since having sent the - * event prevents another call to XFS_SEND_DATA, which is - * what allows the size to change in the first place. - */ - if ((file->f_flags & O_APPEND) && pos != xip->i_size) - goto start; - } - - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(xip) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - return XFS_ERROR(-EINVAL); - } - - if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - iolock = XFS_IOLOCK_EXCL; - need_i_mutex = 1; - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - goto start; - } - } - - new_size = pos + count; - if (new_size > xip->i_size) - xip->i_new_size = new_size; - - if (likely(!(ioflags & IO_INVIS))) - file_update_time(file); - - /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. - */ - - if (pos > xip->i_size) { - error = xfs_zero_eof(xip, pos, xip->i_size); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL); - goto out_unlock_internal; - } - } - xfs_iunlock(xip, XFS_ILOCK_EXCL); - - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - error = -file_remove_suid(file); - if (unlikely(error)) - goto out_unlock_internal; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; - - if ((ioflags & IO_ISDIRECT)) { - if (mapping->nrpages) { - WARN_ON(need_i_mutex == 0); - error = xfs_flushinval_pages(xip, - (pos & PAGE_CACHE_MASK), - -1, FI_REMAPF_LOCKED); - if (error) - goto out_unlock_internal; - } - - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); - - iolock = XFS_IOLOCK_SHARED; - need_i_mutex = 0; - } - - trace_xfs_file_direct_write(xip, count, *offset, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &segs, pos, offset, count, ocount); - - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. 
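The direct-I/O comment just above captures a subtle fallback that the new xfs_file_aio_write() earlier in this diff preserves: a direct write that runs into a hole returns a short count, and the remainder must be completed through the page cache, which can allocate blocks. A condensed, hypothetical rendering of that flow (the real code threads it through "goto relock" with full lock juggling):

static ssize_t write_with_dio_fallback(struct kiocb *iocb,
                                       const struct iovec *iovp,
                                       unsigned long nr_segs,
                                       loff_t pos, size_t count,
                                       size_t ocount)
{
        ssize_t ret;

        ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos,
                                        &iocb->ki_pos, count, ocount);
        if (ret < 0 || ret == count)
                return ret;

        /*
         * Short direct write: the tail of the range hit a hole.
         * Finish the remainder buffered; passing "ret" as the
         * written-so-far argument makes the return value cover the
         * whole request.
         */
        pos += ret;
        count -= ret;
        return generic_file_buffered_write(iocb, iovp, nr_segs, pos,
                                           &iocb->ki_pos, count, ret);
}

The same branch in XFS also clears IO_ISDIRECT and retakes i_mutex before retrying, and the buffered leg additionally retries once on ENOSPC after flushing delalloc pages.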
- */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); - - pos += ret; - count -= ret; - - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(xip, iolock); - goto relock; - } - } else { - int enospc = 0; - ssize_t ret2 = 0; - -write_retry: - trace_xfs_file_buffered_write(xip, count, *offset, ioflags); - ret2 = generic_file_buffered_write(iocb, iovp, segs, - pos, offset, count, ret); - /* - * if we just got an ENOSPC, flush the inode now we - * aren't holding any page locks and retry *once* - */ - if (ret2 == -ENOSPC && !enospc) { - error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); - if (error) - goto out_unlock_internal; - enospc = 1; - goto write_retry; - } - ret = ret2; - } - - current->backing_dev_info = NULL; - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) - *offset = isize; - - if (*offset > xip->i_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_size) - xip->i_size = *offset; - xfs_iunlock(xip, XFS_ILOCK_EXCL); - } - - if (ret == -ENOSPC && - DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { - xfs_iunlock(xip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, - DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, - 0, 0, 0); /* Delay flag intentionally unused */ - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); - if (error) - goto out_unlock_internal; - goto start; - } - - error = -ret; - if (ret <= 0) - goto out_unlock_internal; - - XFS_STATS_ADD(xs_write_bytes, ret); - - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - loff_t end = pos + ret - 1; - int error2; - - xfs_iunlock(xip, iolock); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - - error2 = filemap_write_and_wait_range(mapping, pos, end); - if (!error) - error = error2; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_ilock(xip, iolock); - - error2 = xfs_fsync(xip); - if (!error) - error = error2; - } - - out_unlock_internal: - if (xip->i_new_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - xip->i_new_size = 0; - /* - * If this was a direct or synchronous I/O that failed (such - * as ENOSPC) then part of the I/O may have been written to - * disk before the error occured. In this case the on-disk - * file size may have been adjusted beyond the in-memory file - * size and now needs to be truncated back. - */ - if (xip->i_d.di_size > xip->i_size) - xip->i_d.di_size = xip->i_size; - xfs_iunlock(xip, XFS_ILOCK_EXCL); - } - xfs_iunlock(xip, iolock); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - return -error; -} - -/* - * If the underlying (data/log/rt) device is readonly, there are some - * operations that cannot proceed. - */ -int -xfs_dev_is_read_only( - xfs_mount_t *mp, - char *message) -{ - if (xfs_readonly_buftarg(mp->m_ddev_targp) || - xfs_readonly_buftarg(mp->m_logdev_targp) || - (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { - cmn_err(CE_NOTE, - "XFS: %s required on read-only device.", message); - cmn_err(CE_NOTE, - "XFS: write access unavailable, cannot proceed."); - return EROFS; - } - return 0; -} diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h deleted file mode 100644 index 342ae8c0d01..00000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_LRW_H__ -#define __XFS_LRW_H__ - -struct xfs_mount; -struct xfs_inode; -struct xfs_buf; - -extern int xfs_dev_is_read_only(struct xfs_mount *, char *); - -extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); - -#endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 3d4a0c84d63..1947514ce1a 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -44,20 +44,6 @@ xfs_quota_type(int type) } STATIC int -xfs_fs_quota_sync( - struct super_block *sb, - int type) -{ - struct xfs_mount *mp = XFS_M(sb); - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; - return -xfs_sync_data(mp, 0); -} - -STATIC int xfs_fs_get_xstate( struct super_block *sb, struct fs_quota_stat *fqs) @@ -82,8 +68,6 @@ xfs_fs_set_xstate( return -EROFS; if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (uflags & XFS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -144,14 +128,11 @@ xfs_fs_set_xquota( return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); } const struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quota_sync, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, .get_xquota = xfs_fs_get_xquota, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 25ea2408118..71345a370d9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1063,7 +1063,7 @@ xfs_log_inode( STATIC int xfs_fs_write_inode( struct inode *inode, - int sync) + struct writeback_control *wbc) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1074,11 +1074,7 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (sync) { - error = xfs_wait_on_pages(ip, 0, -1); - if (error) - goto out; - + if (wbc->sync_mode == WB_SYNC_ALL) { /* * Make sure the inode has hit stable storage. 
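The xfs_super.c hunk tracks a VFS interface change: ->write_inode() now receives the writeback_control instead of a bare sync flag, so synchronous writeback is detected via wbc->sync_mode. A hypothetical skeleton of the new convention, with the filesystem-specific work elided:

#include <linux/fs.h>
#include <linux/writeback.h>

static int example_write_inode(struct inode *inode,
                               struct writeback_control *wbc)
{
        if (wbc->sync_mode == WB_SYNC_ALL) {
                /* caller will wait: push the inode to stable storage */
                /* ... synchronous flush ... */
        } else {
                /* opportunistic writeback: start I/O, don't wait */
                /* ... asynchronous flush ... */
        }
        return 0;
}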
By using the * log and the fsync transactions we reduce the IOs we have diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a9f6d20aff4..05cd85317f6 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -607,7 +607,8 @@ xfssyncd( set_freezable(); timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); for (;;) { - timeleft = schedule_timeout_interruptible(timeleft); + if (list_empty(&mp->m_sync_list)) + timeleft = schedule_timeout_interruptible(timeleft); /* swsusp */ try_to_freeze(); if (kthread_should_stop() && list_empty(&mp->m_sync_list)) @@ -627,8 +628,7 @@ xfssyncd( list_add_tail(&mp->m_sync_work.w_list, &mp->m_sync_list); } - list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) - list_move(&work->w_list, &tmp); + list_splice_init(&mp->m_sync_list, &tmp); spin_unlock(&mp->m_sync_lock); list_for_each_entry_safe(work, n, &tmp, w_list) { @@ -688,12 +688,12 @@ xfs_inode_set_reclaim_tag( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - read_lock(&pag->pag_ici_lock); + write_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); + write_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index 856eb3c8d60..5a107601e96 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -52,22 +52,6 @@ #include "quota/xfs_dquot.h" /* - * Format fsblock number into a static buffer & return it. - */ -STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno) -{ - static char rval[50]; - - if (bno == NULLFSBLOCK) - sprintf(rval, "NULLFSBLOCK"); - else if (isnullstartblock(bno)) - sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno)); - else - sprintf(rval, "%lld", (xfs_dfsbno_t)bno); - return rval; -} - -/* * We include this last to have the helpers above available for the trace * event implementations. 
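In the xfssyncd hunk above, the per-element list_move() loop becomes a single list_splice_init(), which detaches the whole pending queue in O(1) while m_sync_lock is held and leaves the source list empty and reusable. The generic drain pattern, with a hypothetical item type:

#include <linux/list.h>
#include <linux/spinlock.h>

struct work_item {                      /* hypothetical */
        struct list_head w_list;
};

static void drain_queue(spinlock_t *lock, struct list_head *queue)
{
        struct work_item *w, *n;
        LIST_HEAD(tmp);

        spin_lock(lock);
        list_splice_init(queue, &tmp);  /* O(1): steal everything at once */
        spin_unlock(lock);

        /* process the stolen entries without holding the lock */
        list_for_each_entry_safe(w, n, &tmp, w_list) {
                list_del(&w->w_list);
                /* ... handle *w ... */
        }
}

The same hunk also stops xfssyncd from sleeping when m_sync_list is non-empty, so work queued while the thread was busy is picked up immediately.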
*/ diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index a4574dcf506..fcaa62f0799 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -197,13 +197,13 @@ TRACE_EVENT(xfs_iext_insert, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %s count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - xfs_fmtfsblock(__entry->startblock), + (__int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -241,13 +241,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %s count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), (long)__entry->idx, __entry->startoff, - xfs_fmtfsblock(__entry->startblock), + (__int64_t)__entry->startblock, __entry->blockcount, __entry->state, (char *)__entry->caller_ip) @@ -593,7 +593,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, TP_ARGS(dqp), TP_STRUCT__entry( __field(dev_t, dev) - __field(__be32, id) + __field(u32, id) __field(unsigned, flags) __field(unsigned, nrefs) __field(unsigned long long, res_bcount) @@ -606,7 +606,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, ), \ TP_fast_assign( __entry->dev = dqp->q_mount->m_super->s_dev; - __entry->id = dqp->q_core.d_id; + __entry->id = be32_to_cpu(dqp->q_core.d_id); __entry->flags = dqp->dq_flags; __entry->nrefs = dqp->q_nrefs; __entry->res_bcount = dqp->q_res_bcount; @@ -622,10 +622,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, be64_to_cpu(dqp->q_core.d_ino_softlimit); ), TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " - "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " - "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", MAJOR(__entry->dev), MINOR(__entry->dev), - be32_to_cpu(__entry->id), + __entry->id, __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), __entry->nrefs, __entry->res_bcount, @@ -881,7 +881,7 @@ TRACE_EVENT(name, \ ), \ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ "offset 0x%llx count %zd flags %s " \ - "startoff 0x%llx startblock %s blockcount 0x%llx", \ + "startoff 0x%llx startblock %lld blockcount 0x%llx", \ MAJOR(__entry->dev), MINOR(__entry->dev), \ __entry->ino, \ __entry->size, \ @@ -890,7 +890,7 @@ TRACE_EVENT(name, \ __entry->count, \ __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ __entry->startoff, \ - xfs_fmtfsblock(__entry->startblock), \ + (__int64_t)__entry->startblock, \ __entry->blockcount) \ ) DEFINE_IOMAP_EVENT(xfs_iomap_enter); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 1869fb97381..5c11e4d1701 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2550,22 +2550,134 @@ xfs_bmap_rtalloc( } STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + struct xfs_perag *pag; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + args->type = XFS_ALLOCTYPE_NEAR_BNO; + else + args->type = XFS_ALLOCTYPE_START_BNO; + 
args->total = ap->total; + + /* + * Search for an allocation group with a single extent large enough + * for the request. If one isn't found, then adjust the minimum + * allocation size to the largest space found. + */ + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + pag = xfs_perag_get(mp, ag); + while (*blen < ap->alen) { + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, args->tp, ag, + XFS_ALLOC_FLAG_TRYLOCK); + if (error) { + xfs_perag_put(pag); + return error; + } + } + + /* + * See xfs_alloc_fix_freelist... + */ + if (pag->pagf_init) { + xfs_extlen_t longest; + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + } else + notinit = 1; + + if (xfs_inode_is_filestream(ap->ip)) { + if (*blen >= ap->alen) + break; + + if (ap->userdata) { + /* + * If startag is an invalid AG, we've + * come here once before and + * xfs_filestream_new_ag picked the + * best currently available. + * + * Don't continue looping, since we + * could loop forever. + */ + if (startag == NULLAGNUMBER) + break; + + error = xfs_filestream_new_ag(ap, &ag); + xfs_perag_put(pag); + if (error) + return error; + + /* loop again to set 'blen'*/ + startag = NULLAGNUMBER; + pag = xfs_perag_get(mp, ag); + continue; + } + } + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + xfs_perag_put(pag); + pag = xfs_perag_get(mp, ag); + } + xfs_perag_put(pag); + + /* + * Since the above loop did a BUF_TRYLOCK, it is + * possible that there is space for this request. + */ + if (notinit || *blen < ap->minlen) + args->minlen = ap->minlen; + /* + * If the best seen length is less than the request + * length, use the best as the minimum. + */ + else if (*blen < ap->alen) + args->minlen = *blen; + /* + * Otherwise we've seen an extent as big as alen, + * use that as the minimum. + */ + else + args->minlen = ap->alen; + + /* + * set the failure fallback case to look in the selected + * AG as the stream may have moved. + */ + if (xfs_inode_is_filestream(ap->ip)) + ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + + return 0; +} + +STATIC int xfs_bmap_btalloc( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ xfs_extlen_t align; /* minimum allocation alignment */ - xfs_agnumber_t ag; xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t startag; + xfs_agnumber_t ag; xfs_alloc_arg_t args; xfs_extlen_t blen; xfs_extlen_t nextminlen = 0; - xfs_perag_t *pag; int nullfb; /* true if ap->firstblock isn't set */ int isaligned; - int notinit; int tryagain; int error; @@ -2612,103 +2724,9 @@ xfs_bmap_btalloc( args.firstblock = ap->firstblock; blen = 0; if (nullfb) { - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) - args.type = XFS_ALLOCTYPE_NEAR_BNO; - else - args.type = XFS_ALLOCTYPE_START_BNO; - args.total = ap->total; - - /* - * Search for an allocation group with a single extent - * large enough for the request. - * - * If one isn't found, then adjust the minimum allocation - * size to the largest space found. - */ - startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); - if (startag == NULLAGNUMBER) - startag = ag = 0; - notinit = 0; - pag = xfs_perag_get(mp, ag); - while (blen < ap->alen) { - if (!pag->pagf_init && - (error = xfs_alloc_pagf_init(mp, args.tp, - ag, XFS_ALLOC_FLAG_TRYLOCK))) { - xfs_perag_put(pag); - return error; - } - /* - * See xfs_alloc_fix_freelist... 
- */ - if (pag->pagf_init) { - xfs_extlen_t longest; - longest = xfs_alloc_longest_free_extent(mp, pag); - if (blen < longest) - blen = longest; - } else - notinit = 1; - - if (xfs_inode_is_filestream(ap->ip)) { - if (blen >= ap->alen) - break; - - if (ap->userdata) { - /* - * If startag is an invalid AG, we've - * come here once before and - * xfs_filestream_new_ag picked the - * best currently available. - * - * Don't continue looping, since we - * could loop forever. - */ - if (startag == NULLAGNUMBER) - break; - - error = xfs_filestream_new_ag(ap, &ag); - xfs_perag_put(pag); - if (error) - return error; - - /* loop again to set 'blen'*/ - startag = NULLAGNUMBER; - pag = xfs_perag_get(mp, ag); - continue; - } - } - if (++ag == mp->m_sb.sb_agcount) - ag = 0; - if (ag == startag) - break; - xfs_perag_put(pag); - pag = xfs_perag_get(mp, ag); - } - xfs_perag_put(pag); - /* - * Since the above loop did a BUF_TRYLOCK, it is - * possible that there is space for this request. - */ - if (notinit || blen < ap->minlen) - args.minlen = ap->minlen; - /* - * If the best seen length is less than the request - * length, use the best as the minimum. - */ - else if (blen < ap->alen) - args.minlen = blen; - /* - * Otherwise we've seen an extent as big as alen, - * use that as the minimum. - */ - else - args.minlen = ap->alen; - - /* - * set the failure fallback case to look in the selected - * AG as the stream may have moved. - */ - if (xfs_inode_is_filestream(ap->ip)) - ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; } else if (ap->low) { if (xfs_inode_is_filestream(ap->ip)) args.type = XFS_ALLOCTYPE_FIRST_AG; diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index f52ac276277..7cf7220e7d5 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -292,7 +292,8 @@ typedef struct xfs_bstat { __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ __u16 bs_projid; /* project id */ - unsigned char bs_pad[14]; /* pad space, unused */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + unsigned char bs_pad[12]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e281eb4a1c4..6845db90818 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -190,13 +190,12 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); /* - * We need to set XFS_INEW atomically with clearing the - * reclaimable tag so that we do have an indicator of the - * inode still being initialized. + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. */ - ip->i_flags |= XFS_INEW; - ip->i_flags &= ~XFS_IRECLAIMABLE; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); + ip->i_flags |= XFS_IRECLAIM; spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); @@ -216,7 +215,15 @@ xfs_iget_cache_hit( trace_xfs_iget_reclaim(ip); goto out_error; } + + write_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); } else { /* If the VFS inode is being torn down, pause and try again. 
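The xfs_iget.c change above closes a race between recycling an XFS_IRECLAIMABLE inode and xfs_reclaim_inode(): the recycler first sets XFS_IRECLAIM under i_flags_lock so reclaim backs off, drops everything for the blocking re-initialisation, and only then retakes pag_ici_lock for writing, because clearing a radix-tree tag mutates shared tag bitmaps and is unsafe under the read lock. The ordering, reduced to a skeleton (error paths elided):

        spin_lock(&ip->i_flags_lock);
        ip->i_flags |= XFS_IRECLAIM;    /* fend off xfs_reclaim_inode() */
        spin_unlock(&ip->i_flags_lock);
        read_unlock(&pag->pag_ici_lock);

        /* ... blocking re-initialisation of the VFS inode ... */

        write_lock(&pag->pag_ici_lock); /* tag update needs exclusion */
        spin_lock(&ip->i_flags_lock);
        ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
        ip->i_flags |= XFS_INEW;
        __xfs_inode_clear_reclaim_tag(mp, pag, ip);
        inode->i_state = I_NEW;
        spin_unlock(&ip->i_flags_lock);
        write_unlock(&pag->pag_ici_lock);

The matching xfs_sync.c hunk earlier converts xfs_inode_set_reclaim_tag() to write_lock() for the same reason: the read lock only protects lookups, not tag stores.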
*/ if (!igrab(inode)) { diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fa31360046d..0ffd5644704 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2439,75 +2439,31 @@ xfs_idestroy_fork( } /* - * Increment the pin count of the given buffer. - * This value is protected by ipinlock spinlock in the mount structure. + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. */ -void -xfs_ipin( - xfs_inode_t *ip) -{ - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - atomic_inc(&ip->i_pincount); -} - -/* - * Decrement the pin count of the given inode, and wake up - * anyone in xfs_iwait_unpin() if the count goes to 0. The - * inode must have been previously pinned with a call to xfs_ipin(). - */ -void -xfs_iunpin( - xfs_inode_t *ip) -{ - ASSERT(atomic_read(&ip->i_pincount) > 0); - - if (atomic_dec_and_test(&ip->i_pincount)) - wake_up(&ip->i_ipin_wait); -} - -/* - * This is called to unpin an inode. It can be directed to wait or to return - * immediately without waiting for the inode to be unpinned. The caller must - * have the inode locked in at least shared mode so that the buffer cannot be - * subsequently pinned once someone is waiting for it to be unpinned. - */ -STATIC void -__xfs_iunpin_wait( - xfs_inode_t *ip, - int wait) +static void +xfs_iunpin_nowait( + struct xfs_inode *ip) { - xfs_inode_log_item_t *iip = ip->i_itemp; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - if (atomic_read(&ip->i_pincount) == 0) - return; /* Give the log a push to start the unpinning I/O */ - if (iip && iip->ili_last_lsn) - xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0); - else - xfs_log_force(ip->i_mount, 0); + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); - if (wait) - wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); } void xfs_iunpin_wait( - xfs_inode_t *ip) + struct xfs_inode *ip) { - __xfs_iunpin_wait(ip, 1); -} - -static inline void -xfs_iunpin_nowait( - xfs_inode_t *ip) -{ - __xfs_iunpin_wait(ip, 0); + if (xfs_ipincount(ip)) { + xfs_iunpin_nowait(ip); + wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); + } } - /* * xfs_iextents_copy() * diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 6c912b02759..9965e40a461 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -471,8 +471,6 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); -void xfs_ipin(xfs_inode_t *); -void xfs_iunpin(xfs_inode_t *); void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); void xfs_ichgtime(xfs_inode_t *, int); @@ -480,6 +478,7 @@ void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); void xfs_synchronize_times(xfs_inode_t *); +void xfs_mark_inode_dirty(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #define IHOLD(ip) \ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d4dc063111f..7bfea854015 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -535,23 +535,23 @@ xfs_inode_item_format( /* * This is called to pin the inode associated with the inode log - * item in memory so it cannot be written out. Do this by calling - * xfs_ipin() to bump the pin count in the inode while holding the - * inode pin lock. + * item in memory so it cannot be written out. 
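
The consolidated pin/unpin code above reduces to a counter-and-waitqueue handshake: the unpin side decrements the pin count and wakes waiters when it reaches zero, while xfs_iunpin_wait() first kicks off the unpinning log I/O and then sleeps until the count drains. A user-space sketch of the same pattern, using a mutex and condition variable in place of the kernel wait queue:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unpinned = PTHREAD_COND_INITIALIZER;
static int pincount;

static void pin(void)
{
	pthread_mutex_lock(&lock);
	pincount++;                             /* atomic_inc(&ip->i_pincount) */
	pthread_mutex_unlock(&lock);
}

static void unpin(void)
{
	pthread_mutex_lock(&lock);
	if (--pincount == 0)                    /* atomic_dec_and_test() */
		pthread_cond_broadcast(&unpinned);  /* wake_up(&ip->i_ipin_wait) */
	pthread_mutex_unlock(&lock);
}

static void unpin_wait(void)
{
	/* The kernel forces the log here first, to start the unpin I/O. */
	pthread_mutex_lock(&lock);
	while (pincount > 0)                    /* wait_event(..., pincount == 0) */
		pthread_cond_wait(&unpinned, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pin();
	unpin();
	unpin_wait();                           /* returns at once: count is 0 */
	printf("pincount drained\n");
	return 0;
}
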
*/ STATIC void xfs_inode_item_pin( xfs_inode_log_item_t *iip) { ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); - xfs_ipin(iip->ili_inode); + + atomic_inc(&iip->ili_inode->i_pincount); } /* * This is called to unpin the inode associated with the inode log * item which was previously pinned with a call to xfs_inode_item_pin(). - * Just call xfs_iunpin() on the inode to do this. + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. */ /* ARGSUSED */ STATIC void @@ -559,7 +559,11 @@ xfs_inode_item_unpin( xfs_inode_log_item_t *iip, int stale) { - xfs_iunpin(iip->ili_inode); + struct xfs_inode *ip = iip->ili_inode; + + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up(&ip->i_ipin_wait); } /* ARGSUSED */ @@ -568,7 +572,7 @@ xfs_inode_item_unpin_remove( xfs_inode_log_item_t *iip, xfs_trans_t *tp) { - xfs_iunpin(iip->ili_inode); + xfs_inode_item_unpin(iip, 0); } /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 3af02314c60..b1b801e4a28 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -106,6 +106,7 @@ xfs_bulkstat_one_iget( buf->bs_dmevmask = dic->di_dmevmask; buf->bs_dmstate = dic->di_dmstate; buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); switch (dic->di_format) { case XFS_DINODE_FMT_DEV: @@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode( buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); buf->bs_aextents = be16_to_cpu(dic->di_anextents); + buf->bs_forkoff = XFS_DFORK_BOFF(dic); switch (dic->di_format) { case XFS_DINODE_FMT_DEV: diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4f16be4b6ee..e8fba92d7cd 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -60,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], - int nentries, xfs_log_ticket_t tic, + int nentries, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, xlog_in_core_t **commit_iclog, uint flags); @@ -243,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) * out when the next write occurs. */ xfs_lsn_t -xfs_log_done(xfs_mount_t *mp, - xfs_log_ticket_t xtic, - void **iclog, - uint flags) +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) { - xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; - xfs_lsn_t lsn = 0; + struct log *log = mp->m_log; + xfs_lsn_t lsn = 0; if (XLOG_FORCED_SHUTDOWN(log) || /* @@ -258,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp, * If we get an error, just continue and give back the log ticket. */ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(mp, ticket, - (xlog_in_core_t **)iclog, &lsn)))) { + (xlog_commit_record(mp, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { flags |= XFS_LOG_REL_PERM_RESERV; @@ -289,7 +288,7 @@ xfs_log_done(xfs_mount_t *mp, } return lsn; -} /* xfs_log_done */ +} /* * Attaches a new iclog I/O completion callback routine during @@ -298,11 +297,11 @@ xfs_log_done(xfs_mount_t *mp, * executing the callback at an appropriate time. 
*/ int -xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ - void *iclog_hndl, /* iclog to hang callback off */ - xfs_log_callback_t *cb) +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) { - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; int abortflg; spin_lock(&iclog->ic_callback_lock); @@ -316,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ } spin_unlock(&iclog->ic_callback_lock); return abortflg; -} /* xfs_log_notify */ +} int -xfs_log_release_iclog(xfs_mount_t *mp, - void *iclog_hndl) +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; - - if (xlog_state_release_iclog(log, iclog)) { + if (xlog_state_release_iclog(mp->m_log, iclog)) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return EIO; } @@ -344,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp, * reservation, we prevent over allocation problems. */ int -xfs_log_reserve(xfs_mount_t *mp, - int unit_bytes, - int cnt, - xfs_log_ticket_t *ticket, - __uint8_t client, - uint flags, - uint t_type) +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticket, + __uint8_t client, + uint flags, + uint t_type) { - xlog_t *log = mp->m_log; - xlog_ticket_t *internal_ticket; - int retval = 0; + struct log *log = mp->m_log; + struct xlog_ticket *internal_ticket; + int retval = 0; ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); ASSERT((flags & XFS_LOG_NOSLEEP) == 0); @@ -367,7 +365,7 @@ xfs_log_reserve(xfs_mount_t *mp, if (*ticket != NULL) { ASSERT(flags & XFS_LOG_PERM_RESERV); - internal_ticket = (xlog_ticket_t *)*ticket; + internal_ticket = *ticket; trace_xfs_log_reserve(log, internal_ticket); @@ -519,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) xlog_in_core_t *first_iclog; #endif xfs_log_iovec_t reg[1]; - xfs_log_ticket_t tic = NULL; + xlog_ticket_t *tic = NULL; xfs_lsn_t lsn; int error; @@ -656,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp) * transaction occur with one call to xfs_log_write(). */ int -xfs_log_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn) +xfs_log_write( + struct xfs_mount *mp, + struct xfs_log_iovec reg[], + int nentries, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn) { - int error; - xlog_t *log = mp->m_log; + struct log *log = mp->m_log; + int error; if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); - if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { + error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xfs_log_write */ - +} void xfs_log_move_tail(xfs_mount_t *mp, @@ -1642,16 +1640,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) * bytes have been written out. 
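
These log-interface changes replace the old opaque xfs_log_ticket_t (a typedef for void *) with a forward-declared struct xlog_ticket *, so the compiler can catch mismatched handles while the structure layout stays private. A small self-contained illustration of the pattern, with stand-in names rather than the XFS types:

#include <stdio.h>
#include <stdlib.h>

/* What callers would see in the header: an incomplete type. */
struct ticket;

/* The definition stays private to the implementation in real code. */
struct ticket { int reservation; };

static struct ticket *ticket_alloc(int res)
{
	struct ticket *t = malloc(sizeof(*t));
	if (t)
		t->reservation = res;
	return t;
}

static void log_write(struct ticket *t)
{
	/* With the old "typedef void *" handle, any pointer converted
	 * silently; now a mismatched pointer is a compile-time error. */
	printf("writing with reservation %d\n", t->reservation);
}

int main(void)
{
	struct ticket *t = ticket_alloc(4096);
	if (!t)
		return 1;
	log_write(t);
	free(t);
	return 0;
}
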
*/ STATIC int -xlog_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags) +xlog_write( + struct xfs_mount *mp, + struct xfs_log_iovec reg[], + int nentries, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) { xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xlog_ticket_t *)tic; xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ xlog_op_header_t *logop_head; /* ptr to log operation header */ __psint_t ptr; /* copy address into data region */ @@ -1765,7 +1763,7 @@ xlog_write(xfs_mount_t * mp, default: xfs_fs_cmn_err(CE_WARN, mp, "Bad XFS transaction clientid 0x%x in ticket 0x%p", - logop_head->oh_clientid, tic); + logop_head->oh_clientid, ticket); return XFS_ERROR(EIO); } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 7074be9d13e..97a24c7795a 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -110,8 +110,6 @@ typedef struct xfs_log_iovec { uint i_type; /* type of region */ } xfs_log_iovec_t; -typedef void* xfs_log_ticket_t; - /* * Structure used to pass callback function and the function's argument * to the log manager. @@ -126,10 +124,12 @@ typedef struct xfs_log_callback { #ifdef __KERNEL__ /* Log manager interfaces */ struct xfs_mount; +struct xlog_in_core; struct xlog_ticket; + xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - xfs_log_ticket_t ticket, - void **iclog, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, uint flags); int _xfs_log_force(struct xfs_mount *mp, uint flags, @@ -151,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp); void xfs_log_move_tail(struct xfs_mount *mp, xfs_lsn_t tail_lsn); int xfs_log_notify(struct xfs_mount *mp, - void *iclog, + struct xlog_in_core *iclog, xfs_log_callback_t *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, - void *iclog_hndl); + struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, - xfs_log_ticket_t *ticket, + struct xlog_ticket **ticket, __uint8_t clientid, uint flags, uint t_type); int xfs_log_write(struct xfs_mount *mp, xfs_log_iovec_t region[], int nentries, - xfs_log_ticket_t ticket, + struct xlog_ticket *ticket, xfs_lsn_t *start_lsn); int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6afaaeb2950..e79b56b4bca 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1097,13 +1097,15 @@ xfs_default_resblks(xfs_mount_t *mp) __uint64_t resblks; /* - * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. - * This may drive us straight to ENOSPC on mount, but that implies - * we were already there on the last unmount. Warn if this occurs. + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. */ resblks = mp->m_sb.sb_dblocks; do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 1024); + resblks = min_t(__uint64_t, resblks, 8192); return resblks; } @@ -1417,6 +1419,9 @@ xfs_mountfs( * when at ENOSPC. This is needed for operations like create with * attr, unwritten extent conversion at ENOSPC, etc. Data allocations * are not allowed to use this reserved space. 
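
The xfs_default_resblks() hunk above raises the cap on the default reserve pool from 1024 to 8192 filesystem blocks: with a 4-block reservation per allocation transaction, 8192 blocks covers 8192 / 4 = 2048 concurrent reservations, the "roughly 2000" the new comment cites. A sketch of the arithmetic, with hypothetical filesystem sizes:

#include <stdio.h>
#include <stdint.h>

static uint64_t default_resblks(uint64_t dblocks)
{
	uint64_t resblks = dblocks / 20;          /* 5% of data blocks  */
	return resblks < 8192 ? resblks : 8192;   /* min_t(..., 8192)   */
}

int main(void)
{
	/* Small fs: the 5% term wins; large fs: the 8192-block cap wins. */
	printf("%llu\n", (unsigned long long)default_resblks(50000));     /* 2500 */
	printf("%llu\n", (unsigned long long)default_resblks(100000000)); /* 8192 */
	return 0;
}
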
+ * + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. */ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { resblks = xfs_default_resblks(mp); @@ -1725,26 +1730,30 @@ xfs_mod_incore_sb_unlocked( lcounter += rem; } } else { /* Taking blocks away */ - lcounter += delta; + if (lcounter >= 0) { + mp->m_sb.sb_fdblocks = lcounter + + XFS_ALLOC_SET_ASIDE(mp); + return 0; + } - /* - * If were out of blocks, use any available reserved blocks if - * were allowed to. - */ + /* + * We are out of blocks, use any available reserved + * blocks if were allowed to. + */ + if (!rsvd) + return XFS_ERROR(ENOSPC); - if (lcounter < 0) { - if (rsvd) { - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter < 0) { - return XFS_ERROR(ENOSPC); - } - mp->m_resblks_avail = lcounter; - return 0; - } else { /* not reserved */ - return XFS_ERROR(ENOSPC); - } + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); } mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); @@ -2052,6 +2061,26 @@ xfs_mount_log_sb( return error; } +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + struct xfs_mount *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + cmn_err(CE_NOTE, + "XFS: %s required on read-only device.", message); + cmn_err(CE_NOTE, + "XFS: write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} #ifdef HAVE_PERCPU_SB /* diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 14dafd60823..4fa0bc7b983 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -436,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *); extern int xfs_fs_writable(xfs_mount_t *); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + extern int xfs_dmops_get(struct xfs_mount *); extern void xfs_dmops_put(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index be942d4e332..f73e358bae8 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -796,7 +796,7 @@ _xfs_trans_commit( int sync; #define XFS_TRANS_LOGVEC_COUNT 16 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - void *commit_iclog; + struct xlog_in_core *commit_iclog; int shutdown; commit_lsn = -1; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c93e3a10285..79c8bab9dff 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -910,7 +910,7 @@ typedef struct xfs_trans { unsigned int t_blk_res_used; /* # of resvd blocks used */ unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ - xfs_log_ticket_t t_ticket; /* log mgr ticket */ + struct xlog_ticket *t_ticket; /* log mgr ticket */ xfs_lsn_t t_lsn; /* log seq num of start of * transaction. 
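
The reworked xfs_mod_incore_sb_unlocked() path above orders the ENOSPC handling as: satisfy the delta from the free-block counter, fall back to the reserve pool only for rsvd callers, and warn once when the pool itself runs dry. A simplified user-space model of that fallback (field names abbreviated, a plain positive ENOSPC in place of XFS_ERROR()):

#include <stdio.h>
#include <errno.h>

struct counters {
	long long fdblocks;       /* free data blocks */
	long long resblks_avail;  /* reserve pool     */
};

static int warned;

static int take_blocks(struct counters *c, long long delta, int rsvd)
{
	long long lcounter = c->fdblocks + delta;   /* delta < 0 when taking */

	if (lcounter >= 0) {
		c->fdblocks = lcounter;
		return 0;
	}

	/* Out of free blocks: dip into the reserve pool if allowed. */
	if (!rsvd)
		return ENOSPC;

	lcounter = c->resblks_avail + delta;
	if (lcounter >= 0) {
		c->resblks_avail = lcounter;
		return 0;
	}

	if (!warned) {              /* printk_once() in the hunk above */
		fprintf(stderr, "reserve blocks depleted!\n");
		warned = 1;
	}
	return ENOSPC;
}

int main(void)
{
	struct counters c = { .fdblocks = 2, .resblks_avail = 8 };

	printf("%d\n", take_blocks(&c, -2, 1));   /* 0: from fdblocks */
	printf("%d\n", take_blocks(&c, -4, 1));   /* 0: from reserve  */
	printf("%d\n", take_blocks(&c, -8, 1));   /* ENOSPC, warns    */
	return 0;
}
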
*/ xfs_lsn_t t_commit_lsn; /* log seq num of end of diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 5ffd544434e..fb586360d1c 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -46,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, xfs_daddr_t, int); +/* + * Add the locked buffer to the transaction. + * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); + + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + XFS_BUF_SET_FSPRIVATE2(bp, tp); + +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} /* * Get and lock the buffer for the caller if it is not already @@ -132,40 +191,8 @@ xfs_trans_get_buf(xfs_trans_t *tp, ASSERT(!XFS_BUF_GETERROR(bp)); - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_get_buf(bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); return (bp); } @@ -210,44 +237,11 @@ xfs_trans_getsb(xfs_trans_t *tp, } bp = xfs_getsb(mp, flags); - if (bp == NULL) { + if (bp == NULL) return NULL; - } - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). 
- */ - xfs_buf_item_init(bp, mp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - trace_xfs_trans_getsb(bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); return (bp); } @@ -425,40 +419,9 @@ xfs_trans_read_buf( if (XFS_FORCED_SHUTDOWN(mp)) goto shutdown_abort; - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_read_buf(bip); *bpp = bp; return 0; @@ -623,53 +586,6 @@ xfs_trans_brelse(xfs_trans_t *tp, } /* - * Add the locked buffer to the transaction. - * The buffer must be locked, and it cannot be associated with any - * transaction. - * - * If the buffer does not yet have a buf log item associated with it, - * then allocate one for it. Then add the buf item to the transaction. - */ -void -xfs_trans_bjoin(xfs_trans_t *tp, - xfs_buf_t *bp) -{ - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * in xfs_trans_get_buf() and friends above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - trace_xfs_trans_bjoin(bip); -} - -/* * Mark the buffer as not needing to be unlocked when the buf item's * IOP_UNLOCK() routine is called. The buffer must already be locked * and associated with the given transaction. 
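
The _xfs_trans_bjoin() factoring above collapses four copies of the buffer-join boilerplate into one helper, with a reset_recur flag covering the single behavioural difference between the get/read paths and plain xfs_trans_bjoin(). The shape of the refactor in miniature, with simplified stand-in types:

#include <stdio.h>

struct buf {
	int recur;    /* transaction recursion count */
	int joined;   /* stands in for bli refcount + log item + fsprivate2 */
};

static void bjoin(struct buf *bp, int reset_recur)
{
	bp->joined = 1;      /* setup shared by all four call sites */
	if (reset_recur)
		bp->recur = 0;   /* get_buf/getsb/read_buf reset; plain join doesn't */
}

static void trans_get_buf(struct buf *bp) { bjoin(bp, 1); }
static void trans_bjoin(struct buf *bp)   { bjoin(bp, 0); }

int main(void)
{
	struct buf a = { .recur = 3 }, b = { .recur = 3 };

	trans_get_buf(&a);
	trans_bjoin(&b);
	printf("a: joined=%d recur=%d; b: joined=%d recur=%d\n",
	       a.joined, a.recur, b.joined, b.recur);
	return 0;
}
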
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ddd2c5d1b85..9d376be0ea3 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -584,113 +584,6 @@ xfs_readlink( } /* - * xfs_fsync - * - * This is called to sync the inode and its data out to disk. We need to hold - * the I/O lock while flushing the data, and the inode lock while flushing the - * inode. The inode lock CANNOT be held while flushing the data, so acquire - * after we're done with that. - */ -int -xfs_fsync( - xfs_inode_t *ip) -{ - xfs_trans_t *tp; - int error = 0; - int log_flushed = 0; - - xfs_itrace_entry(ip); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - /* - * We always need to make sure that the required inode state is safe on - * disk. The inode might be clean but we still might need to force the - * log because of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional changes to the - * inode core that have to go to disk and this requires us to issue - * a synchronous transaction to capture these changes correctly. - * - * This code relies on the assumption that if the update_* fields - * of the inode are clear and the inode is unpinned then it is clean - * and no action is required. - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - - if (!ip->i_update_core) { - /* - * Timestamps/size haven't changed since last inode flush or - * inode transaction commit. That means either nothing got - * written or a transaction committed which caught the updates. - * If the latter happened and the transaction hasn't hit the - * disk yet, the inode will be still be pinned. If it is, - * force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) { - if (ip->i_itemp->ili_last_lsn) { - error = _xfs_log_force_lsn(ip->i_mount, - ip->i_itemp->ili_last_lsn, - XFS_LOG_SYNC, &log_flushed); - } else { - error = _xfs_log_force(ip->i_mount, - XFS_LOG_SYNC, &log_flushed); - } - } - } else { - /* - * Kick off a transaction to log the inode core to get the - * updates. The sync transaction will also force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed ourselves out - * of the way during trans_reserve which would flush the inode. - * But there's no guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer could be pinned - * anyway if it's part of an inode in another recent - * transaction. So we play it safe and fire off the - * transaction anyway. - */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, &log_flushed); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { - /* - * If the log write didn't issue an ordered tag we need - * to flush the disk cache for the data device now. - */ - if (!log_flushed) - xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); - - /* - * If this inode is on the RT dev we need to flush that - * cache as well. 
- */ if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); - } - - return error; -} - -/* * Flags for xfs_free_eofblocks */ #define XFS_FREE_EOF_TRYLOCK (1<<0) diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 774f40729ca..d8dfa8d0dad 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags); #define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ int xfs_readlink(struct xfs_inode *ip, char *link); -int xfs_fsync(struct xfs_inode *ip); int xfs_release(struct xfs_inode *ip); int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, @@ -50,18 +49,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); -ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, - const struct iovec *iovp, unsigned int segs, - loff_t *offset, int ioflags); -ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp, - loff_t *ppos, struct pipe_inode_info *pipe, size_t count, - int flags, int ioflags); -ssize_t xfs_splice_write(struct xfs_inode *ip, - struct pipe_inode_info *pipe, struct file *outfilp, - loff_t *ppos, size_t count, int flags, int ioflags); -ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb, - const struct iovec *iovp, unsigned int nsegs, - loff_t *offset, int ioflags); int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int flags, struct xfs_iomap *iomapp, int *niomaps); void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, @@ -72,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last, uint64_t flags, int fiopt); int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); + #endif /* _XFS_VNODEOPS_H */
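
For reference, the xfs_fsync() body removed above implemented a three-way decision that survives elsewhere in the kernel's fsync path: a clean-but-pinned inode only needs the log forced to its last LSN, an inode with unlogged core changes needs a synchronous transaction, and a block-device cache flush is issued only when the log write did not already cover it. A simplified model of that decision logic (types and names are stand-ins, not the XFS ones):

#include <stdbool.h>
#include <stdio.h>

struct inode_state {
	bool update_core;   /* unlogged timestamp/size changes pending */
	bool pinned;        /* a committed transaction not yet on disk */
};

static void fsync_decision(const struct inode_state *ip)
{
	if (!ip->update_core) {
		/* Nothing new to log; force the log only if the inode is
		 * still pinned by an in-flight commit. */
		if (ip->pinned)
			puts("force log to the inode's last LSN");
		else
			puts("clean and unpinned: nothing to do");
	} else {
		/* Unlogged core changes: capture them with a synchronous
		 * transaction, which also forces the log. */
		puts("commit a sync transaction logging the inode core");
	}
	/* A data-device (and RT-device) cache flush would follow here
	 * when the log force did not issue an ordered write. */
}

int main(void)
{
	struct inode_state a = { false, true };
	struct inode_state b = { true, false };

	fsync_decision(&a);
	fsync_decision(&b);
	return 0;
}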