diff options
Diffstat (limited to 'fs')
571 files changed, 24519 insertions, 20086 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 14da82564f4e..6894b085f0ee 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -537,7 +537,7 @@ static struct attribute_group v9fs_attr_group = { * */ -static int v9fs_sysfs_init(void) +static int __init v9fs_sysfs_init(void) { v9fs_kobj = kobject_create_and_add("9p", fs_kobj); if (!v9fs_kobj) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index c71e88602ff4..cc1cfae726b3 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -259,8 +259,7 @@ static int v9fs_launder_page(struct page *page) * */ static ssize_t -v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) +v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { /* * FIXME @@ -269,7 +268,7 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", iocb->ki_filp->f_path.dentry->d_name.name, - (long long)pos, nr_segs); + (long long)pos, iter->nr_segs); return -EINVAL; } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 4d0c2e0be7e5..0b3bfa303dda 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -42,7 +42,6 @@ /** * struct p9_rdir - readdir accounting - * @mutex: mutex protecting readdir * @head: start offset of current dirread buffer * @tail: end offset of current dirread buffer * @buf: dirread buffer diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index d8223209d4b1..520c11c2dcca 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -352,9 +352,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, invalidate_mapping_pages(&inode->i_data, 0, -1); } /* Convert flock to posix lock */ - fl->fl_owner = (fl_owner_t)filp; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; fl->fl_flags |= FL_POSIX; fl->fl_flags ^= FL_FLOCK; @@ -684,7 +681,7 @@ v9fs_direct_read(struct file *filp, char __user *udata, size_t count, /** * v9fs_cached_file_read - read from a file * @filp: file pointer to read - * @udata: user data buffer to read data into + * @data: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * @@ -695,13 +692,13 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, { if (filp->f_flags & O_DIRECT) return v9fs_direct_read(filp, data, count, offset); - return do_sync_read(filp, data, count, offset); + return new_sync_read(filp, data, count, offset); } /** * v9fs_mmap_file_read - read from a file * @filp: file pointer to read - * @udata: user data buffer to read data into + * @data: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * @@ -763,7 +760,7 @@ err_out: buff_write: mutex_unlock(&inode->i_mutex); - return do_sync_write(filp, data, count, offsetp); + return new_sync_write(filp, data, count, offsetp); } /** @@ -781,7 +778,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, if (filp->f_flags & O_DIRECT) return v9fs_direct_write(filp, data, count, offset); - return do_sync_write(filp, data, count, offset); + return new_sync_write(filp, data, count, offset); } @@ -850,8 +847,8 @@ const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, .read = v9fs_cached_file_read, .write = v9fs_cached_file_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, @@ -863,8 +860,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = { .llseek = generic_file_llseek, .read = v9fs_cached_file_read, .write = v9fs_cached_file_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 53161ec058a7..7fa4f7a7653d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -147,7 +147,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, int major = -1, minor = -1; strlcpy(ext, stat->extension, sizeof(ext)); - sscanf(ext, "%c %u %u", &type, &major, &minor); + sscanf(ext, "%c %i %i", &type, &major, &minor); switch (type) { case 'c': res |= S_IFCHR; @@ -580,7 +580,7 @@ static int v9fs_at_to_dotl_flags(int flags) * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted * @dentry: dentry that is being deleted - * @rmdir: removing a directory + * @flags: removing a directory * */ @@ -778,7 +778,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from * @dentry: dentry that is being walked to? - * @nameidata: path data + * @flags: lookup flags (unused) * */ @@ -1324,7 +1324,7 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) * v9fs_vfs_mkspecial - create a special file * @dir: inode to create special file in * @dentry: dentry to create - * @mode: mode to create special file + * @perm: mode to create special file * @extension: 9p2000.u format extension string representing special file * */ diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 59dc8e87647f..1fa85aae24df 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -226,7 +226,7 @@ int v9fs_open_to_dotl_flags(int flags) * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created * @dentry: dentry that is being deleted - * @mode: create permissions + * @omode: create permissions * */ @@ -375,7 +375,7 @@ err_clunk_old_fid: * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked - * @mode: mode for new directory + * @omode: mode for new directory * */ @@ -607,7 +607,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate - * @sb: superblock of filesystem * */ @@ -808,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * v9fs_vfs_mknod_dotl - create a special file * @dir: inode destination for new link * @dentry: dentry for file - * @mode: mode for creation + * @omode: mode for creation * @rdev: device associated with special file * */ diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 04133a1fd9cb..f95e01e058e4 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -156,7 +156,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, offset += write_count; value_len -= write_count; } - retval = offset; + retval = 0; err: p9_client_clunk(fid); return retval; diff --git a/fs/Makefile b/fs/Makefile index f9cb9876e466..4030cbfbc9af 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o +obj-y += buffer.o block_dev.o direct-io.o mpage.o else obj-y += no-block.o endif obj-$(CONFIG_PROC_FS) += proc_namespace.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_ANON_INODES) += anon_inodes.o diff --git a/fs/adfs/file.c b/fs/adfs/file.c index a36da5382b40..07c9edce5aa7 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -23,12 +23,12 @@ const struct file_operations adfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, }; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 25b23b1e7f22..9bca88159725 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -1,3 +1,9 @@ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> @@ -206,7 +212,7 @@ affs_set_blocksize(struct super_block *sb, int size) static inline struct buffer_head * affs_bread(struct super_block *sb, int block) { - pr_debug("affs_bread: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) return sb_bread(sb, block); return NULL; @@ -214,7 +220,7 @@ affs_bread(struct super_block *sb, int block) static inline struct buffer_head * affs_getblk(struct super_block *sb, int block) { - pr_debug("affs_getblk: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) return sb_getblk(sb, block); return NULL; @@ -223,7 +229,7 @@ static inline struct buffer_head * affs_getzeroblk(struct super_block *sb, int block) { struct buffer_head *bh; - pr_debug("affs_getzeroblk: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { bh = sb_getblk(sb, block); lock_buffer(bh); @@ -238,7 +244,7 @@ static inline struct buffer_head * affs_getemptyblk(struct super_block *sb, int block) { struct buffer_head *bh; - pr_debug("affs_getemptyblk: %d\n", block); + pr_debug("%s: %d\n", __func__, block); if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) { bh = sb_getblk(sb, block); wait_on_buffer(bh); @@ -251,7 +257,7 @@ static inline void affs_brelse(struct buffer_head *bh) { if (bh) - pr_debug("affs_brelse: %lld\n", (long long) bh->b_blocknr); + pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr); brelse(bh); } diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 533a322c41c0..406b29836b19 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -34,7 +34,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) ino = bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); - pr_debug("AFFS: insert_hash(dir=%u, ino=%d)\n", (u32)dir->i_ino, ino); + pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino); dir_bh = affs_bread(sb, dir->i_ino); if (!dir_bh) @@ -84,7 +84,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) sb = dir->i_sb; rem_ino = rem_bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); - pr_debug("AFFS: remove_hash(dir=%d, ino=%d, hashval=%d)\n", (u32)dir->i_ino, rem_ino, offset); + pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n", + __func__, (u32)dir->i_ino, rem_ino, offset); bh = affs_bread(sb, dir->i_ino); if (!bh) @@ -147,7 +148,7 @@ affs_remove_link(struct dentry *dentry) u32 link_ino, ino; int retval; - pr_debug("AFFS: remove_link(key=%ld)\n", inode->i_ino); + pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, inode->i_ino); if (!bh) @@ -279,7 +280,7 @@ affs_remove_header(struct dentry *dentry) if (!inode) goto done; - pr_debug("AFFS: remove_header(key=%ld)\n", inode->i_ino); + pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); if (!bh) @@ -451,10 +452,10 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...) vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); va_end(args); - printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id, + pr_crit("error (device %s): %s(): %s\n", sb->s_id, function,ErrorBuffer); if (!(sb->s_flags & MS_RDONLY)) - printk(KERN_WARNING "AFFS: Remounting filesystem read-only\n"); + pr_warn("Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } @@ -467,7 +468,7 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); va_end(args); - printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id, + pr_warn("(device %s): %s(): %s\n", sb->s_id, function,ErrorBuffer); } diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index a32246b8359e..c8de51185c23 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -17,7 +17,7 @@ affs_count_free_blocks(struct super_block *sb) u32 free; int i; - pr_debug("AFFS: count_free_blocks()\n"); + pr_debug("%s()\n", __func__); if (sb->s_flags & MS_RDONLY) return 0; @@ -43,7 +43,7 @@ affs_free_block(struct super_block *sb, u32 block) u32 blk, bmap, bit, mask, tmp; __be32 *data; - pr_debug("AFFS: free_block(%u)\n", block); + pr_debug("%s(%u)\n", __func__, block); if (block > sbi->s_partition_size) goto err_range; @@ -125,7 +125,7 @@ affs_alloc_block(struct inode *inode, u32 goal) sb = inode->i_sb; sbi = AFFS_SB(sb); - pr_debug("AFFS: balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); + pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); if (AFFS_I(inode)->i_pa_cnt) { pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); @@ -254,8 +254,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) return 0; if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { - printk(KERN_NOTICE "AFFS: Bitmap invalid - mounting %s read only\n", - sb->s_id); + pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); *flags |= MS_RDONLY; return 0; } @@ -268,7 +267,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) size = sbi->s_bmap_count * sizeof(*bm); bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL); if (!sbi->s_bitmap) { - printk(KERN_ERR "AFFS: Bitmap allocation failed\n"); + pr_err("Bitmap allocation failed\n"); return -ENOMEM; } @@ -282,17 +281,17 @@ int affs_init_bitmap(struct super_block *sb, int *flags) bm->bm_key = be32_to_cpu(bmap_blk[blk]); bh = affs_bread(sb, bm->bm_key); if (!bh) { - printk(KERN_ERR "AFFS: Cannot read bitmap\n"); + pr_err("Cannot read bitmap\n"); res = -EIO; goto out; } if (affs_checksum_block(sb, bh)) { - printk(KERN_WARNING "AFFS: Bitmap %u invalid - mounting %s read only.\n", - bm->bm_key, sb->s_id); + pr_warn("Bitmap %u invalid - mounting %s read only.\n", + bm->bm_key, sb->s_id); *flags |= MS_RDONLY; goto out; } - pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key); + pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4); /* Don't try read the extension if this is the last block, @@ -304,7 +303,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) affs_brelse(bmap_bh); bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk])); if (!bmap_bh) { - printk(KERN_ERR "AFFS: Cannot read bitmap extension\n"); + pr_err("Cannot read bitmap extension\n"); res = -EIO; goto out; } diff --git a/fs/affs/dir.c b/fs/affs/dir.c index cbbda476a805..59f07bec92a6 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -54,8 +54,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) u32 ino; int error = 0; - pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n", - inode->i_ino, (unsigned long)ctx->pos); + pr_debug("%s(ino=%lu,f_pos=%lx)\n", + __func__, inode->i_ino, (unsigned long)ctx->pos); if (ctx->pos < 2) { file->private_data = (void *)0; @@ -81,7 +81,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) */ ino = (u32)(long)file->private_data; if (ino && file->f_version == inode->i_version) { - pr_debug("AFFS: readdir() left off=%d\n", ino); + pr_debug("readdir() left off=%d\n", ino); goto inside; } @@ -117,7 +117,7 @@ inside: namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); name = AFFS_TAIL(sb, fh_bh)->name + 1; - pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", " + pr_debug("readdir(): dir_emit(\"%.*s\", " "ino=%u), hash=%d, f_pos=%x\n", namelen, name, ino, hash_pos, (u32)ctx->pos); diff --git a/fs/affs/file.c b/fs/affs/file.c index 8669b6ecddee..a7fe57d2cd9a 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -27,10 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp); const struct file_operations affs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = affs_file_open, .release = affs_file_release, @@ -45,7 +45,7 @@ const struct inode_operations affs_file_inode_operations = { static int affs_file_open(struct inode *inode, struct file *filp) { - pr_debug("AFFS: open(%lu,%d)\n", + pr_debug("open(%lu,%d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); atomic_inc(&AFFS_I(inode)->i_opencnt); return 0; @@ -54,7 +54,7 @@ affs_file_open(struct inode *inode, struct file *filp) static int affs_file_release(struct inode *inode, struct file *filp) { - pr_debug("AFFS: release(%lu, %d)\n", + pr_debug("release(%lu, %d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { @@ -324,7 +324,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul struct buffer_head *ext_bh; u32 ext; - pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); + pr_debug("%s(%u, %lu)\n", + __func__, (u32)inode->i_ino, (unsigned long)block); BUG_ON(block > (sector_t)0x7fffffffUL); @@ -498,34 +499,36 @@ affs_getemptyblk_ino(struct inode *inode, int block) } static int -affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) +affs_do_readpage_ofs(struct page *page, unsigned to) { struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; char *data; + unsigned pos = 0; u32 bidx, boff, bsize; u32 tmp; - pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to); - BUG_ON(from > to || to > PAGE_CACHE_SIZE); + pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino, + page->index, to); + BUG_ON(to > PAGE_CACHE_SIZE); kmap(page); data = page_address(page); bsize = AFFS_SB(sb)->s_data_blksize; - tmp = (page->index << PAGE_CACHE_SHIFT) + from; + tmp = page->index << PAGE_CACHE_SHIFT; bidx = tmp / bsize; boff = tmp % bsize; - while (from < to) { + while (pos < to) { bh = affs_bread_ino(inode, bidx, 0); if (IS_ERR(bh)) return PTR_ERR(bh); - tmp = min(bsize - boff, to - from); - BUG_ON(from + tmp > to || tmp > bsize); - memcpy(data + from, AFFS_DATA(bh) + boff, tmp); + tmp = min(bsize - boff, to - pos); + BUG_ON(pos + tmp > to || tmp > bsize); + memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; - from += tmp; + pos += tmp; boff = 0; } flush_dcache_page(page); @@ -542,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) u32 size, bsize; u32 tmp; - pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize); + pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize); bsize = AFFS_SB(sb)->s_data_blksize; bh = NULL; size = AFFS_I(inode)->mmu_private; @@ -608,14 +611,14 @@ affs_readpage_ofs(struct file *file, struct page *page) u32 to; int err; - pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index); + pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index); to = PAGE_CACHE_SIZE; if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { to = inode->i_size & ~PAGE_CACHE_MASK; memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to); } - err = affs_do_readpage_ofs(file, page, 0, to); + err = affs_do_readpage_ofs(page, to); if (!err) SetPageUptodate(page); unlock_page(page); @@ -631,7 +634,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping pgoff_t index; int err = 0; - pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino, + (unsigned long long)pos, (unsigned long long)pos + len); if (pos > AFFS_I(inode)->mmu_private) { /* XXX: this probably leaves a too-big i_size in case of * failure. Should really be updating i_size at write_end time @@ -651,7 +655,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE); + err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE); if (err) { unlock_page(page); page_cache_release(page); @@ -680,7 +684,9 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, * due to write_begin. */ - pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len); + pr_debug("%s(%u, %llu, %llu)\n", + __func__, (u32)inode->i_ino, (unsigned long long)pos, + (unsigned long long)pos + len); bsize = AFFS_SB(sb)->s_data_blksize; data = page_address(page); @@ -802,7 +808,7 @@ affs_free_prealloc(struct inode *inode) { struct super_block *sb = inode->i_sb; - pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino); + pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino); while (AFFS_I(inode)->i_pa_cnt) { AFFS_I(inode)->i_pa_cnt--; @@ -822,7 +828,7 @@ affs_truncate(struct inode *inode) struct buffer_head *ext_bh; int i; - pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n", + pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n", (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size); last_blk = 0; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 96df91e8c334..bec2d1a0c91c 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -34,7 +34,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) if (!(inode->i_state & I_NEW)) return inode; - pr_debug("AFFS: affs_iget(%lu)\n", inode->i_ino); + pr_debug("affs_iget(%lu)\n", inode->i_ino); block = inode->i_ino; bh = affs_bread(sb, block); @@ -175,7 +175,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) uid_t uid; gid_t gid; - pr_debug("AFFS: write_inode(%lu)\n",inode->i_ino); + pr_debug("write_inode(%lu)\n", inode->i_ino); if (!inode->i_nlink) // possibly free block @@ -220,7 +220,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) struct inode *inode = dentry->d_inode; int error; - pr_debug("AFFS: notify_change(%lu,0x%x)\n",inode->i_ino,attr->ia_valid); + pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); error = inode_change_ok(inode,attr); if (error) @@ -258,7 +258,8 @@ void affs_evict_inode(struct inode *inode) { unsigned long cache_page; - pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); + pr_debug("evict_inode(ino=%lu, nlink=%u)\n", + inode->i_ino, inode->i_nlink); truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { @@ -271,7 +272,7 @@ affs_evict_inode(struct inode *inode) affs_free_prealloc(inode); cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { - pr_debug("AFFS: freeing ext cache\n"); + pr_debug("freeing ext cache\n"); AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_ac = NULL; free_page(cache_page); @@ -350,7 +351,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 u32 block = 0; int retval; - pr_debug("AFFS: add_entry(dir=%u, inode=%u, \"%*s\", type=%d)\n", (u32)dir->i_ino, + pr_debug("%s(dir=%u, inode=%u, \"%*s\", type=%d)\n", + __func__, (u32)dir->i_ino, (u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type); retval = -EIO; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 6dae1ccd176d..035bd31556fc 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -190,7 +190,8 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) toupper_t toupper = affs_get_toupper(sb); u32 key; - pr_debug("AFFS: find_entry(\"%.*s\")\n", (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(\"%.*s\")\n", + __func__, (int)dentry->d_name.len, dentry->d_name.name); bh = affs_bread(sb, dir->i_ino); if (!bh) @@ -218,7 +219,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) struct buffer_head *bh; struct inode *inode = NULL; - pr_debug("AFFS: lookup(\"%.*s\")\n",(int)dentry->d_name.len,dentry->d_name.name); + pr_debug("%s(\"%.*s\")\n", + __func__, (int)dentry->d_name.len, dentry->d_name.name); affs_lock_dir(dir); bh = affs_find_entry(dir, dentry); @@ -248,9 +250,9 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino, - dentry->d_inode->i_ino, - (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(dir=%d, %lu \"%.*s\")\n", + __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, + (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); } @@ -262,7 +264,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) struct inode *inode; int error; - pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len, + pr_debug("%s(%lu,\"%.*s\",0%ho)\n", + __func__, dir->i_ino, (int)dentry->d_name.len, dentry->d_name.name,mode); inode = affs_new_inode(dir); @@ -291,8 +294,9 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int error; - pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino, - (int)dentry->d_name.len,dentry->d_name.name,mode); + pr_debug("%s(%lu,\"%.*s\",0%ho)\n", + __func__, dir->i_ino, (int)dentry->d_name.len, + dentry->d_name.name, mode); inode = affs_new_inode(dir); if (!inode) @@ -317,8 +321,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino, - dentry->d_inode->i_ino, + pr_debug("%s(dir=%u, %lu \"%.*s\")\n", + __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); @@ -334,8 +338,9 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) int i, maxlen, error; char c, lc; - pr_debug("AFFS: symlink(%lu,\"%.*s\" -> \"%s\")\n",dir->i_ino, - (int)dentry->d_name.len,dentry->d_name.name,symname); + pr_debug("%s(%lu,\"%.*s\" -> \"%s\")\n", + __func__, dir->i_ino, (int)dentry->d_name.len, + dentry->d_name.name, symname); maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; inode = affs_new_inode(dir); @@ -404,7 +409,8 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - pr_debug("AFFS: link(%u, %u, \"%.*s\")\n", (u32)inode->i_ino, (u32)dir->i_ino, + pr_debug("%s(%u, %u, \"%.*s\")\n", + __func__, (u32)inode->i_ino, (u32)dir->i_ino, (int)dentry->d_name.len,dentry->d_name.name); return affs_add_entry(dir, inode, dentry, ST_LINKFILE); @@ -418,9 +424,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; - pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", - (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, - (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); + pr_debug("%s(old=%u,\"%*s\" to new=%u,\"%*s\")\n", + __func__, (u32)old_dir->i_ino, (int)old_dentry->d_name.len, + old_dentry->d_name.name, (u32)new_dir->i_ino, + (int)new_dentry->d_name.len, new_dentry->d_name.name); retval = affs_check_name(new_dentry->d_name.name, new_dentry->d_name.len, diff --git a/fs/affs/super.c b/fs/affs/super.c index 6d589f28bf9b..51f1a95bff73 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -46,7 +46,7 @@ static void affs_put_super(struct super_block *sb) { struct affs_sb_info *sbi = AFFS_SB(sb); - pr_debug("AFFS: put_super()\n"); + pr_debug("%s()\n", __func__); cancel_delayed_work_sync(&sbi->sb_work); } @@ -220,7 +220,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, return 0; if (n != 512 && n != 1024 && n != 2048 && n != 4096) { - printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); return 0; } *blocksize = n; @@ -285,8 +285,8 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, /* Silently ignore the quota options */ break; default: - printk("AFFS: Unrecognized mount option \"%s\" " - "or missing value\n", p); + pr_warn("Unrecognized mount option \"%s\" or missing value\n", + p); return 0; } } @@ -319,7 +319,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) save_mount_options(sb, data); - pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options"); + pr_debug("read_super(%s)\n", data ? (const char *)data : "no options"); sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; @@ -339,9 +339,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { - printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); + pr_err("Error parsing options\n"); return -EINVAL; } /* N.B. after this point s_prefix must be released */ @@ -358,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) */ size = sb->s_bdev->bd_inode->i_size >> 9; - pr_debug("AFFS: initial blocksize=%d, #blocks=%d\n", 512, size); + pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); affs_set_blocksize(sb, PAGE_SIZE); /* Try to find root block. Its location depends on the block size. */ @@ -373,7 +371,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_root_block = root_block; if (root_block < 0) sbi->s_root_block = (reserved + size - 1) / 2; - pr_debug("AFFS: setting blocksize to %d\n", blocksize); + pr_debug("setting blocksize to %d\n", blocksize); affs_set_blocksize(sb, blocksize); sbi->s_partition_size = size; @@ -388,7 +386,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) * block behind the calculated one. So we check this one, too. */ for (num_bm = 0; num_bm < 2; num_bm++) { - pr_debug("AFFS: Dev %s, trying root=%u, bs=%d, " + pr_debug("Dev %s, trying root=%u, bs=%d, " "size=%d, reserved=%d\n", sb->s_id, sbi->s_root_block + num_bm, @@ -409,8 +407,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) } } if (!silent) - printk(KERN_ERR "AFFS: No valid root block on device %s\n", - sb->s_id); + pr_err("No valid root block on device %s\n", sb->s_id); return -EINVAL; /* N.B. after this point bh must be released */ @@ -422,7 +419,7 @@ got_root: /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); if (!boot_bh) { - printk(KERN_ERR "AFFS: Cannot read boot block\n"); + pr_err("Cannot read boot block\n"); return -EINVAL; } memcpy(sig, boot_bh->b_data, 4); @@ -435,8 +432,7 @@ got_root: */ if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS || chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_NOTICE "AFFS: Dircache FS - mounting %s read only\n", - sb->s_id); + pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); sb->s_flags |= MS_RDONLY; } switch (chksum) { @@ -470,14 +466,14 @@ got_root: sb->s_flags |= MS_NOEXEC; break; default: - printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n", - sb->s_id, chksum); + pr_err("Unknown filesystem on device %s: %08X\n", + sb->s_id, chksum); return -EINVAL; } if (mount_flags & SF_VERBOSE) { u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; - printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", + pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", len > 31 ? 31 : len, AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1, sig, sig[3] + '0', blocksize); @@ -508,11 +504,11 @@ got_root: sb->s_root = d_make_root(root_inode); if (!sb->s_root) { - printk(KERN_ERR "AFFS: Get root inode failed\n"); + pr_err("AFFS: Get root inode failed\n"); return -ENOMEM; } - pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); + pr_debug("s_flags=%lX\n", sb->s_flags); return 0; } @@ -532,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) char volume[32]; char *prefix = NULL; - pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); + pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); *flags |= MS_NODIRATIME; @@ -580,8 +576,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) int free; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, - AFFS_SB(sb)->s_reserved); + pr_debug("%s() partsize=%d, reserved=%d\n", + __func__, AFFS_SB(sb)->s_partition_size, + AFFS_SB(sb)->s_reserved); free = affs_count_free_blocks(sb); buf->f_type = AFFS_SUPER_MAGIC; diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index ee00f08c4f53..f39b71c3981e 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -21,7 +21,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) char c; char lc; - pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); + pr_debug("follow_link(ino=%lu)\n", inode->i_ino); err = -EIO; bh = affs_bread(inode->i_sb, inode->i_ino); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 1c8c6cc6de30..4b0eff6da674 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call) { _enter(""); + /* Break the callbacks here so that we do it after the final ACK is + * received. The step number here must match the final number in + * afs_deliver_cb_callback(). + */ + if (call->unmarshall == 6) { + ASSERT(call->server && call->count && call->request); + afs_break_callbacks(call->server, call->count, call->request); + } + afs_put_server(call->server); call->server = NULL; kfree(call->buffer); @@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("trailer"); if (skb->len != 0) return -EBADMSG; + + /* Record that the message was unmarshalled successfully so + * that the call destructor can know do the callback breaking + * work, even if the final ACK isn't received. + * + * If the step number changes, then afs_cm_destructor() must be + * updated also. + */ + call->unmarshall++; + case 6: break; } diff --git a/fs/afs/file.c b/fs/afs/file.c index 66d50fe2ee45..932ce07948b3 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -31,10 +31,10 @@ const struct file_operations afs_file_operations = { .open = afs_open, .release = afs_release, .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = afs_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = afs_file_write, .mmap = generic_file_readonly_mmap, .splice_read = generic_file_splice_read, .fsync = afs_fsync, diff --git a/fs/afs/flock.c b/fs/afs/flock.c index a8cf2cff836c..4baf1d2b39e4 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -555,10 +555,6 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) return -ENOLCK; /* we're simulating flock() locks using posix locks on the server */ - fl->fl_owner = (fl_owner_t) file; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - if (fl->fl_type == F_UNLCK) return afs_do_unlk(file, fl); return afs_do_setlk(file, fl); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index be75b500005d..71d5982312f3 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -75,7 +75,7 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - work_func_t async_workfn; + void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ struct sk_buff_head rx_queue; /* received packets */ @@ -747,8 +747,7 @@ extern int afs_write_end(struct file *file, struct address_space *mapping, extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); -extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, - unsigned long, loff_t); +extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_writeback_all(struct afs_vnode *); extern int afs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/afs/main.c b/fs/afs/main.c index 42dd2e499ed8..35de0c04729f 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -55,13 +55,13 @@ static int __init afs_get_client_UUID(void) afs_uuid.time_low = uuidtime; afs_uuid.time_mid = uuidtime >> 32; afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; - afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME; + afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME; get_random_bytes(&clockseq, 2); afs_uuid.clock_seq_low = clockseq; afs_uuid.clock_seq_hi_and_reserved = (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; - afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD; + afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD; _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", afs_uuid.time_low, diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index ef943df73b8c..03a3beb17004 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *); static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct afs_call *); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct work_struct *); +static void afs_process_async_call(struct afs_call *); static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); @@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *); static struct sk_buff_head afs_incoming_calls; static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static void afs_async_workfn(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + call->async_workfn(call); +} + /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT @@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call) } /* + * End a call but do not free it + */ +static void afs_end_call_nofree(struct afs_call *call) +{ + if (call->rxcall) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); +} + +/* + * End a call and free it + */ +static void afs_end_call(struct afs_call *call) +{ + afs_end_call_nofree(call); + afs_free_call(call); +} + +/* * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, @@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - INIT_WORK(&call->async_work, afs_process_async_call); + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -383,11 +413,8 @@ error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - rxrpc_kernel_end_call(rxcall); - call->rxcall = NULL; error_kill_call: - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state >= AFS_CALL_COMPLETE) { while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - if (call->incoming) { - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); - } + if (call->incoming) + afs_end_call(call); } _leave(""); @@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) } _debug("call complete"); - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct work_struct *work) +static void afs_delete_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); afs_free_call(call); @@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work) * - on a multiple-thread workqueue this work item may try to run on several * CPUs at the same time */ -static void afs_process_async_call(struct work_struct *work) +static void afs_process_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); if (!skb_queue_empty(&call->rx_queue)) @@ -637,10 +651,7 @@ static void afs_process_async_call(struct work_struct *work) call->reply = NULL; /* kill the call */ - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - if (call->type->destructor) - call->type->destructor(call); + afs_end_call_nofree(call); /* we can't just delete the call because the work item may be * queued */ @@ -663,13 +674,6 @@ void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) call->reply_size += len; } -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(work); -} - /* * accept the backlog of incoming calls */ @@ -790,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); default: - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); return; } @@ -823,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) call->state = AFS_CALL_AWAIT_ACK; n = rxrpc_kernel_send_data(call->rxcall, &msg, len); if (n >= 0) { + /* Success */ _leave(" [replied]"); return; } + if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); } - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); } diff --git a/fs/afs/write.c b/fs/afs/write.c index a890db4b9898..ab6adfd52516 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -625,15 +625,14 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) /* * write to an AFS file */ -ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); ssize_t result; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(from); - _enter("{%x.%u},{%zu},%lu,", - vnode->fid.vid, vnode->fid.vnode, count, nr_segs); + _enter("{%x.%u},{%zu},", + vnode->fid.vid, vnode->fid.vnode, count); if (IS_SWAPFILE(&vnode->vfs_inode)) { printk(KERN_INFO @@ -644,7 +643,7 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, if (!count) return 0; - result = generic_file_aio_write(iocb, iov, nr_segs, pos); + result = generic_file_write_iter(iocb, from); if (IS_ERR_VALUE(result)) { _leave(" = %zd", result); return result; @@ -112,6 +112,11 @@ struct kioctx { struct work_struct free_work; + /* + * signals when all in-flight requests are done + */ + struct completion *requests_done; + struct { /* * This counts the number of available slots in the ringbuffer, @@ -472,7 +477,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) +static int kiocb_cancel(struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; @@ -508,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + /* At this point we know that there are no any in-flight requests */ + if (ctx->requests_done) + complete(ctx->requests_done); + INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); } @@ -529,7 +538,7 @@ static void free_ioctx_users(struct percpu_ref *ref) struct kiocb, ki_list); list_del_init(&req->ki_list); - kiocb_cancel(ctx, req); + kiocb_cancel(req); } spin_unlock_irq(&ctx->ctx_lock); @@ -718,37 +727,42 @@ err: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) +static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct completion *requests_done) { - if (!atomic_xchg(&ctx->dead, 1)) { - struct kioctx_table *table; + struct kioctx_table *table; - spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); + if (atomic_xchg(&ctx->dead, 1)) + return -EINVAL; - WARN_ON(ctx != table->table[ctx->id]); - table->table[ctx->id] = NULL; - rcu_read_unlock(); - spin_unlock(&mm->ioctx_lock); - /* percpu_ref_kill() will do the necessary call_rcu() */ - wake_up_all(&ctx->wait); + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); - /* - * It'd be more correct to do this in free_ioctx(), after all - * the outstanding kiocbs have finished - but by then io_destroy - * has already returned, so io_setup() could potentially return - * -EAGAIN with no ioctxs actually in use (as far as userspace - * could tell). - */ - aio_nr_sub(ctx->max_reqs); + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); - if (ctx->mmap_size) - vm_munmap(ctx->mmap_base, ctx->mmap_size); + /* percpu_ref_kill() will do the necessary call_rcu() */ + wake_up_all(&ctx->wait); - percpu_ref_kill(&ctx->users); - } + /* + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). + */ + aio_nr_sub(ctx->max_reqs); + + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); + + ctx->requests_done = requests_done; + percpu_ref_kill(&ctx->users); + return 0; } /* wait_on_sync_kiocb: @@ -809,23 +823,27 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx); + kill_ioctx(mm, ctx, NULL); } } static void put_reqs_available(struct kioctx *ctx, unsigned nr) { struct kioctx_cpu *kcpu; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { kcpu->reqs_available -= ctx->req_batch; atomic_add(ctx->req_batch, &ctx->reqs_available); } + local_irq_restore(flags); preempt_enable(); } @@ -833,10 +851,12 @@ static bool get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; + unsigned long flags; preempt_disable(); kcpu = this_cpu_ptr(ctx->cpu); + local_irq_save(flags); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -855,6 +875,7 @@ static bool get_reqs_available(struct kioctx *ctx) ret = true; kcpu->reqs_available--; out: + local_irq_restore(flags); preempt_enable(); return ret; } @@ -1007,6 +1028,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. */ kiocb_free(iocb); + put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test @@ -1048,6 +1070,9 @@ static long aio_read_events_ring(struct kioctx *ctx, if (head == tail) goto out; + head %= ctx->nr_events; + tail %= ctx->nr_events; + while (ret < nr) { long avail; struct io_event *ev; @@ -1086,8 +1111,6 @@ static long aio_read_events_ring(struct kioctx *ctx, flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); - - put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); @@ -1185,7 +1208,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(current->mm, ioctx); + kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } @@ -1203,9 +1226,25 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(current->mm, ioctx); + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + int ret; + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + ret = kill_ioctx(current->mm, ioctx, &requests_done); percpu_ref_put(&ioctx->users); - return 0; + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + if (!ret) + wait_for_completion(&requests_done); + + return ret; } pr_debug("EINVAL: io_destroy: invalid context id\n"); return -EINVAL; @@ -1213,6 +1252,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); +typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int rw, char __user *buf, @@ -1270,7 +1310,9 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, int rw; fmode_t mode; aio_rw_op *rw_op; + rw_iter_op *iter_op; struct iovec inline_vec, *iovec = &inline_vec; + struct iov_iter iter; switch (opcode) { case IOCB_CMD_PREAD: @@ -1278,6 +1320,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, mode = FMODE_READ; rw = READ; rw_op = file->f_op->aio_read; + iter_op = file->f_op->read_iter; goto rw_common; case IOCB_CMD_PWRITE: @@ -1285,12 +1328,13 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, mode = FMODE_WRITE; rw = WRITE; rw_op = file->f_op->aio_write; + iter_op = file->f_op->write_iter; goto rw_common; rw_common: if (unlikely(!(file->f_mode & mode))) return -EBADF; - if (!rw_op) + if (!rw_op && !iter_op) return -EINVAL; ret = (opcode == IOCB_CMD_PREADV || @@ -1299,10 +1343,8 @@ rw_common: &iovec, compat) : aio_setup_single_vector(req, rw, buf, &nr_segs, iovec); - if (ret) - return ret; - - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { if (iovec != &inline_vec) kfree(iovec); @@ -1321,7 +1363,12 @@ rw_common: if (rw == WRITE) file_start_write(file); - ret = rw_op(req, iovec, nr_segs, req->ki_pos); + if (iter_op) { + iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); + ret = iter_op(req, &iter); + } else { + ret = rw_op(req, iovec, nr_segs, req->ki_pos); + } if (rw == WRITE) file_end_write(file); @@ -1559,7 +1606,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, kiocb = lookup_kiocb(ctx, iocb, key); if (kiocb) - ret = kiocb_cancel(ctx, kiocb); + ret = kiocb_cancel(kiocb); else ret = -EINVAL; diff --git a/fs/attr.c b/fs/attr.c index 5d4e59d56e85..6530ced19697 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -50,14 +50,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || !uid_eq(attr->ia_uid, inode->i_uid)) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !inode_capable(inode, CAP_CHOWN)) + !capable_wrt_inode_uidgid(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -67,7 +67,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -160,7 +160,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) umode_t mode = attr->ia_mode; if (!in_group_p(inode->i_gid) && - !inode_capable(inode, CAP_FSETID)) + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 232e03d4780d..5b570b6efa28 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -737,7 +737,7 @@ MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); MODULE_ALIAS("devname:autofs"); /* Register/deregister misc character device */ -int autofs_dev_ioctl_init(void) +int __init autofs_dev_ioctl_init(void) { int r; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d7bd395ab586..1c55388ae633 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; - int pgrp; + int pgrp = 0; bool pgrp_set = false; int ret = -EINVAL; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2caf36ac3e93..cc87c1abac97 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; diff --git a/fs/befs/btree.c b/fs/befs/btree.c index a2cd305a993a..9c7faa8a9288 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -318,7 +318,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, * befs_find_key - Search for a key within a node * @sb: Filesystem superblock * @node: Node to find the key within - * @key: Keystring to search for + * @findkey: Keystring to search for * @value: If key is found, the value stored with the key is put here * * finds exact match if one exists, and returns BEFS_BT_MATCH @@ -405,7 +405,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, * Heres how it works: Key_no is the index of the key/value pair to * return in keybuf/value. * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is - * the number of charecters in the key (just a convenience). + * the number of characters in the key (just a convenience). * * Algorithm: * Get the first leafnode of the tree. See if the requested key is in that @@ -502,12 +502,11 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, "for key of size %d", __func__, bufsize, keylen); brelse(this_node->bh); goto error_alloc; - }; + } - strncpy(keybuf, keystart, keylen); + strlcpy(keybuf, keystart, keylen + 1); *value = fs64_to_cpu(sb, valarray[cur_key]); *keysize = keylen; - keybuf[keylen] = '\0'; befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off, cur_key, keylen, keybuf, *value); @@ -707,7 +706,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node, * @key1: pointer to the first key to be compared * @keylen1: length in bytes of key1 * @key2: pointer to the second key to be compared - * @kelen2: length in bytes of key2 + * @keylen2: length in bytes of key2 * * Returns 0 if @key1 and @key2 are equal. * Returns >0 if @key1 is greater. diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index c467bebd50af..1e8e0b8d8836 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -116,7 +116,7 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data, * befs_read_lsmylink - read long symlink from datastream. * @sb: Filesystem superblock * @ds: Datastrem to read from - * @buf: Buffer in which to place long symlink data + * @buff: Buffer in which to place long symlink data * @len: Length of the long symlink in bytes * * Returns the number of bytes read diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index d626756ff721..a16fbd4e8241 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -133,14 +133,6 @@ befs_get_block(struct inode *inode, sector_t block, befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", (unsigned long)inode->i_ino, (long)block); - - if (block < 0) { - befs_error(sb, "befs_get_block() was asked for a block " - "number less than zero: block %ld in inode %lu", - (long)block, (unsigned long)inode->i_ino); - return -EIO; - } - if (create) { befs_error(sb, "befs_get_block() was asked to write to " "block %ld in inode %lu", (long)block, @@ -396,9 +388,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){ inode->i_size = 0; inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; - strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, - BEFS_SYMLINK_LEN - 1); - befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0'; + strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink, + BEFS_SYMLINK_LEN); } else { int num_blks; @@ -591,21 +582,21 @@ befs_utf2nls(struct super_block *sb, const char *in, /** * befs_nls2utf - Convert NLS string to utf8 encodeing * @sb: Superblock - * @src: Input string buffer in NLS format - * @srclen: Length of input string in bytes - * @dest: The output string in UTF-8 format - * @destlen: Length of the output buffer + * @in: Input string buffer in NLS format + * @in_len: Length of input string in bytes + * @out: The output string in UTF-8 format + * @out_len: Length of the output buffer * - * Converts input string @src, which is in the format of the loaded NLS map, + * Converts input string @in, which is in the format of the loaded NLS map, * into a utf8 string. * - * The destination string @dest is allocated by this function and the caller is + * The destination string @out is allocated by this function and the caller is * responsible for freeing it with kfree() * - * On return, *@destlen is the length of @dest in bytes. + * On return, *@out_len is the length of @out in bytes. * * On success, the return value is the number of utf8 characters written to - * the output buffer @dest. + * the output buffer @out. * * On Failure, a negative number coresponding to the error code is returned. */ diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ae2892218335..e7f88ace1a25 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -23,10 +23,10 @@ const struct file_operations bfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, }; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index aa3cb626671e..3892c1a23241 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1108,6 +1108,14 @@ static bool always_dump_vma(struct vm_area_struct *vma) /* Any vsyscall mappings? */ if (vma == get_gate_vma(vma->vm_mm)) return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + /* * arch_vma_name() returns non-NULL for special architecture mappings, * such as vDSO sections. @@ -1686,7 +1694,7 @@ static size_t get_note_info_size(struct elf_note_info *info) static int write_note_info(struct elf_note_info *info, struct coredump_params *cprm) { - bool first = 1; + bool first = true; struct elf_thread_core_info *t = info->thread; do { @@ -1710,7 +1718,7 @@ static int write_note_info(struct elf_note_info *info, !writenote(&t->notes[i], cprm)) return 0; - first = 0; + first = false; t = t->next; } while (t); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index d50bbe59da1e..f723cd3a455c 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -380,7 +380,7 @@ failed: /****************************************************************************/ -void old_reloc(unsigned long rl) +static void old_reloc(unsigned long rl) { #ifdef DEBUG char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c deleted file mode 100644 index 1c2ce0c87711..000000000000 --- a/fs/bio-integrity.c +++ /dev/null @@ -1,657 +0,0 @@ -/* - * bio-integrity.c - bio data integrity extensions - * - * Copyright (C) 2007, 2008, 2009 Oracle Corporation - * Written by: Martin K. Petersen <martin.petersen@oracle.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - * - */ - -#include <linux/blkdev.h> -#include <linux/mempool.h> -#include <linux/export.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/slab.h> - -#define BIP_INLINE_VECS 4 - -static struct kmem_cache *bip_slab; -static struct workqueue_struct *kintegrityd_wq; - -/** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio - * @bio: bio to attach integrity metadata to - * @gfp_mask: Memory allocation mask - * @nr_vecs: Number of integrity metadata scatter-gather elements - * - * Description: This function prepares a bio for attaching integrity - * metadata. nr_vecs specifies the maximum number of pages containing - * integrity metadata that can be attached. - */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) -{ - struct bio_integrity_payload *bip; - struct bio_set *bs = bio->bi_pool; - unsigned long idx = BIO_POOL_NONE; - unsigned inline_vecs; - - if (!bs) { - bip = kmalloc(sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * nr_vecs, gfp_mask); - inline_vecs = nr_vecs; - } else { - bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIP_INLINE_VECS; - } - - if (unlikely(!bip)) - return NULL; - - memset(bip, 0, sizeof(*bip)); - - if (nr_vecs > inline_vecs) { - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - bs->bvec_integrity_pool); - if (!bip->bip_vec) - goto err; - } else { - bip->bip_vec = bip->bip_inline_vecs; - } - - bip->bip_slab = idx; - bip->bip_bio = bio; - bio->bi_integrity = bip; - - return bip; -err: - mempool_free(bip, bs->bio_integrity_pool); - return NULL; -} -EXPORT_SYMBOL(bio_integrity_alloc); - -/** - * bio_integrity_free - Free bio integrity payload - * @bio: bio containing bip to be freed - * - * Description: Used to free the integrity portion of a bio. Usually - * called from bio_free(). - */ -void bio_integrity_free(struct bio *bio) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct bio_set *bs = bio->bi_pool; - - if (bip->bip_owns_buf) - kfree(bip->bip_buf); - - if (bs) { - if (bip->bip_slab != BIO_POOL_NONE) - bvec_free(bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); - - mempool_free(bip, bs->bio_integrity_pool); - } else { - kfree(bip); - } - - bio->bi_integrity = NULL; -} -EXPORT_SYMBOL(bio_integrity_free); - -static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) -{ - if (bip->bip_slab == BIO_POOL_NONE) - return BIP_INLINE_VECS; - - return bvec_nr_vecs(bip->bip_slab); -} - -/** - * bio_integrity_add_page - Attach integrity metadata - * @bio: bio to update - * @page: page containing integrity metadata - * @len: number of bytes of integrity metadata in page - * @offset: start offset within page - * - * Description: Attach a page containing integrity metadata to bio. - */ -int bio_integrity_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct bio_vec *iv; - - if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); - return 0; - } - - iv = bip->bip_vec + bip->bip_vcnt; - - iv->bv_page = page; - iv->bv_len = len; - iv->bv_offset = offset; - bip->bip_vcnt++; - - return len; -} -EXPORT_SYMBOL(bio_integrity_add_page); - -static int bdev_integrity_enabled(struct block_device *bdev, int rw) -{ - struct blk_integrity *bi = bdev_get_integrity(bdev); - - if (bi == NULL) - return 0; - - if (rw == READ && bi->verify_fn != NULL && - (bi->flags & INTEGRITY_FLAG_READ)) - return 1; - - if (rw == WRITE && bi->generate_fn != NULL && - (bi->flags & INTEGRITY_FLAG_WRITE)) - return 1; - - return 0; -} - -/** - * bio_integrity_enabled - Check whether integrity can be passed - * @bio: bio to check - * - * Description: Determines whether bio_integrity_prep() can be called - * on this bio or not. bio data direction and target device must be - * set prior to calling. The functions honors the write_generate and - * read_verify flags in sysfs. - */ -int bio_integrity_enabled(struct bio *bio) -{ - if (!bio_is_rw(bio)) - return 0; - - /* Already protected? */ - if (bio_integrity(bio)) - return 0; - - return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); -} -EXPORT_SYMBOL(bio_integrity_enabled); - -/** - * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto - * @bi: blk_integrity profile for device - * @sectors: Number of 512 sectors to convert - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the hardware - * sector size of the storage device. Convert the block layer sectors - * to physical sectors. - */ -static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, - unsigned int sectors) -{ - /* At this point there are only 512b or 4096b DIF/EPP devices */ - if (bi->sector_size == 4096) - return sectors >>= 3; - - return sectors; -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size; -} - -/** - * bio_integrity_tag_size - Retrieve integrity tag space - * @bio: bio to inspect - * - * Description: Returns the maximum number of tag bytes that can be - * attached to this bio. Filesystems can use this to determine how - * much metadata to attach to an I/O. - */ -unsigned int bio_integrity_tag_size(struct bio *bio) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - - BUG_ON(bio->bi_iter.bi_size == 0); - - return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size); -} -EXPORT_SYMBOL(bio_integrity_tag_size); - -static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, - int set) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned int nr_sectors; - - BUG_ON(bip->bip_buf == NULL); - - if (bi->tag_size == 0) - return -1; - - nr_sectors = bio_integrity_hw_sectors(bi, - DIV_ROUND_UP(len, bi->tag_size)); - - if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) { - printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__, - nr_sectors * bi->tuple_size, bip->bip_iter.bi_size); - return -1; - } - - if (set) - bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - else - bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); - - return 0; -} - -/** - * bio_integrity_set_tag - Attach a tag buffer to a bio - * @bio: bio to attach buffer to - * @tag_buf: Pointer to a buffer containing tag data - * @len: Length of the included buffer - * - * Description: Use this function to tag a bio by leveraging the extra - * space provided by devices formatted with integrity protection. The - * size of the integrity buffer must be <= to the size reported by - * bio_integrity_tag_size(). - */ -int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != WRITE); - - return bio_integrity_tag(bio, tag_buf, len, 1); -} -EXPORT_SYMBOL(bio_integrity_set_tag); - -/** - * bio_integrity_get_tag - Retrieve a tag buffer from a bio - * @bio: bio to retrieve buffer from - * @tag_buf: Pointer to a buffer for the tag data - * @len: Length of the target buffer - * - * Description: Use this function to retrieve the tag buffer from a - * completed I/O. The size of the integrity buffer must be <= to the - * size reported by bio_integrity_tag_size(). - */ -int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) -{ - BUG_ON(bio_data_dir(bio) != READ); - - return bio_integrity_tag(bio, tag_buf, len, 0); -} -EXPORT_SYMBOL(bio_integrity_get_tag); - -/** - * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio - * @bio: bio to generate/verify integrity metadata for - * @operate: operate number, 1 for generate, 0 for verify - */ -static int bio_integrity_generate_verify(struct bio *bio, int operate) -{ - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - struct blk_integrity_exchg bix; - struct bio_vec *bv; - sector_t sector; - unsigned int sectors, ret = 0, i; - void *prot_buf = bio->bi_integrity->bip_buf; - - if (operate) - sector = bio->bi_iter.bi_sector; - else - sector = bio->bi_integrity->bip_iter.bi_sector; - - bix.disk_name = bio->bi_bdev->bd_disk->disk_name; - bix.sector_size = bi->sector_size; - - bio_for_each_segment_all(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; - bix.prot_buf = prot_buf; - bix.sector = sector; - - if (operate) - bi->generate_fn(&bix); - else { - ret = bi->verify_fn(&bix); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } - } - - sectors = bv->bv_len / bi->sector_size; - sector += sectors; - prot_buf += sectors * bi->tuple_size; - - kunmap_atomic(kaddr); - } - return ret; -} - -/** - * bio_integrity_generate - Generate integrity metadata for a bio - * @bio: bio to generate integrity metadata for - * - * Description: Generates integrity metadata for a bio by calling the - * block device's generation callback function. The bio must have a - * bip attached with enough room to accommodate the generated - * integrity metadata. - */ -static void bio_integrity_generate(struct bio *bio) -{ - bio_integrity_generate_verify(bio, 1); -} - -static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) -{ - if (bi) - return bi->tuple_size; - - return 0; -} - -/** - * bio_integrity_prep - Prepare bio for integrity I/O - * @bio: bio to prepare - * - * Description: Allocates a buffer for integrity metadata, maps the - * pages and attaches them to a bio. The bio must have data - * direction, target device and start sector set priot to calling. In - * the WRITE case, integrity metadata will be generated using the - * block device's integrity function. In the READ case, the buffer - * will be prepared for DMA and a suitable end_io handler set up. - */ -int bio_integrity_prep(struct bio *bio) -{ - struct bio_integrity_payload *bip; - struct blk_integrity *bi; - struct request_queue *q; - void *buf; - unsigned long start, end; - unsigned int len, nr_pages; - unsigned int bytes, offset, i; - unsigned int sectors; - - bi = bdev_get_integrity(bio->bi_bdev); - q = bdev_get_queue(bio->bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bio_integrity(bio)); - - sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); - - /* Allocate kernel buffer for protection data */ - len = sectors * blk_integrity_tuple_size(bi); - buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); - if (unlikely(buf == NULL)) { - printk(KERN_ERR "could not allocate integrity buffer\n"); - return -ENOMEM; - } - - end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ((unsigned long) buf) >> PAGE_SHIFT; - nr_pages = end - start; - - /* Allocate bio integrity payload and integrity vectors */ - bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { - printk(KERN_ERR "could not allocate data integrity bioset\n"); - kfree(buf); - return -EIO; - } - - bip->bip_owns_buf = 1; - bip->bip_buf = buf; - bip->bip_iter.bi_size = len; - bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; - - /* Map it */ - offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; - bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) - return 0; - - if (ret < bytes) - break; - - buf += bytes; - len -= bytes; - offset = 0; - } - - /* Install custom I/O completion handler if read verify is enabled */ - if (bio_data_dir(bio) == READ) { - bip->bip_end_io = bio->bi_end_io; - bio->bi_end_io = bio_integrity_endio; - } - - /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) - bio_integrity_generate(bio); - - return 0; -} -EXPORT_SYMBOL(bio_integrity_prep); - -/** - * bio_integrity_verify - Verify integrity metadata for a bio - * @bio: bio to verify - * - * Description: This function is called to verify the integrity of a - * bio. The data in the bio io_vec is compared to the integrity - * metadata returned by the HBA. - */ -static int bio_integrity_verify(struct bio *bio) -{ - return bio_integrity_generate_verify(bio, 0); -} - -/** - * bio_integrity_verify_fn - Integrity I/O completion worker - * @work: Work struct stored in bio to be verified - * - * Description: This workqueue function is called to complete a READ - * request. The function verifies the transferred integrity metadata - * and then calls the original bio end_io function. - */ -static void bio_integrity_verify_fn(struct work_struct *work) -{ - struct bio_integrity_payload *bip = - container_of(work, struct bio_integrity_payload, bip_work); - struct bio *bio = bip->bip_bio; - int error; - - error = bio_integrity_verify(bio); - - /* Restore original bio completion handler */ - bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); -} - -/** - * bio_integrity_endio - Integrity I/O completion function - * @bio: Protected bio - * @error: Pointer to errno - * - * Description: Completion for integrity I/O - * - * Normally I/O completion is done in interrupt context. However, - * verifying I/O integrity is a time-consuming task which must be run - * in process context. This function postpones completion - * accordingly. - */ -void bio_integrity_endio(struct bio *bio, int error) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - - BUG_ON(bip->bip_bio != bio); - - /* In case of an I/O error there is no point in verifying the - * integrity metadata. Restore original bio end_io handler - * and run it. - */ - if (error) { - bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); - - return; - } - - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); - queue_work(kintegrityd_wq, &bip->bip_work); -} -EXPORT_SYMBOL(bio_integrity_endio); - -/** - * bio_integrity_advance - Advance integrity vector - * @bio: bio whose integrity vector to update - * @bytes_done: number of data bytes that have been completed - * - * Description: This function calculates how many integrity bytes the - * number of completed data bytes correspond to and advances the - * integrity vector accordingly. - */ -void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - - bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); -} -EXPORT_SYMBOL(bio_integrity_advance); - -/** - * bio_integrity_trim - Trim integrity vector - * @bio: bio whose integrity vector to update - * @offset: offset to first data sector - * @sectors: number of data sectors - * - * Description: Used to trim the integrity vector in a cloned bio. - * The ivec will be advanced corresponding to 'offset' data sectors - * and the length will be truncated corresponding to 'len' data - * sectors. - */ -void bio_integrity_trim(struct bio *bio, unsigned int offset, - unsigned int sectors) -{ - struct bio_integrity_payload *bip = bio->bi_integrity; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - - bio_integrity_advance(bio, offset << 9); - bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); -} -EXPORT_SYMBOL(bio_integrity_trim); - -/** - * bio_integrity_clone - Callback for cloning bios with integrity metadata - * @bio: New bio - * @bio_src: Original bio - * @gfp_mask: Memory allocation mask - * - * Description: Called to allocate a bip when cloning a bio - */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask) -{ - struct bio_integrity_payload *bip_src = bio_src->bi_integrity; - struct bio_integrity_payload *bip; - - BUG_ON(bip_src == NULL); - - bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; - - memcpy(bip->bip_vec, bip_src->bip_vec, - bip_src->bip_vcnt * sizeof(struct bio_vec)); - - bip->bip_vcnt = bip_src->bip_vcnt; - bip->bip_iter = bip_src->bip_iter; - - return 0; -} -EXPORT_SYMBOL(bio_integrity_clone); - -int bioset_integrity_create(struct bio_set *bs, int pool_size) -{ - if (bs->bio_integrity_pool) - return 0; - - bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); - if (!bs->bio_integrity_pool) - return -1; - - bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_integrity_pool) { - mempool_destroy(bs->bio_integrity_pool); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(bioset_integrity_create); - -void bioset_integrity_free(struct bio_set *bs) -{ - if (bs->bio_integrity_pool) - mempool_destroy(bs->bio_integrity_pool); - - if (bs->bvec_integrity_pool) - mempool_destroy(bs->bvec_integrity_pool); -} -EXPORT_SYMBOL(bioset_integrity_free); - -void __init bio_integrity_init(void) -{ - /* - * kintegrityd won't block much but may burn a lot of CPU cycles. - * Make it highpri CPU intensive wq with max concurrency of 1. - */ - kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | - WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); - if (!kintegrityd_wq) - panic("Failed to create kintegrityd\n"); - - bip_slab = kmem_cache_create("bio_integrity_payload", - sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIP_INLINE_VECS, - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); -} diff --git a/fs/bio.c b/fs/bio.c deleted file mode 100644 index 6f0362b77806..000000000000 --- a/fs/bio.c +++ /dev/null @@ -1,2037 +0,0 @@ -/* - * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - * - */ -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/uio.h> -#include <linux/iocontext.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/mempool.h> -#include <linux/workqueue.h> -#include <linux/cgroup.h> -#include <scsi/sg.h> /* for struct sg_iovec */ - -#include <trace/events/block.h> - -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 - -/* - * if you change this list, also change bvec_alloc or things will - * break badly! cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { - BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), -}; -#undef BV - -/* - * fs_bio_set is the bio_set containing bio and iovec memory pools used by - * IO code that does not need private memory pools. - */ -struct bio_set *fs_bio_set; -EXPORT_SYMBOL(fs_bio_set); - -/* - * Our slab pool management - */ -struct bio_slab { - struct kmem_cache *slab; - unsigned int slab_ref; - unsigned int slab_size; - char name[8]; -}; -static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; - -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) -{ - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; - - mutex_lock(&bio_slab_lock); - - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; - - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } - - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; - - bslab = &bio_slabs[entry]; - - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; - - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: - mutex_unlock(&bio_slab_lock); - return slab; -} - -static void bio_put_slab(struct bio_set *bs) -{ - struct bio_slab *bslab = NULL; - unsigned int i; - - mutex_lock(&bio_slab_lock); - - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - - if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) - goto out; - - WARN_ON(!bslab->slab_ref); - - if (--bslab->slab_ref) - goto out; - - kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; - -out: - mutex_unlock(&bio_slab_lock); -} - -unsigned int bvec_nr_vecs(unsigned short idx) -{ - return bvec_slabs[idx].nr_vecs; -} - -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); - - if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, pool); - else { - struct biovec_slab *bvs = bvec_slabs + idx; - - kmem_cache_free(bvs->slab, bv); - } -} - -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) -{ - struct bio_vec *bvl; - - /* - * see comment near bvec_array define! - */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: - return NULL; - } - - /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. - */ - if (*idx == BIOVEC_MAX_IDX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { - struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); - - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - - /* - * Try a slab allocation. If this fails and __GFP_WAIT - * is set, retry with the 1-entry mempool - */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { - *idx = BIOVEC_MAX_IDX; - goto fallback; - } - } - - return bvl; -} - -static void __bio_free(struct bio *bio) -{ - bio_disassociate_task(bio); - - if (bio_integrity(bio)) - bio_integrity_free(bio); -} - -static void bio_free(struct bio *bio) -{ - struct bio_set *bs = bio->bi_pool; - void *p; - - __bio_free(bio); - - if (bs) { - if (bio_flagged(bio, BIO_OWNS_VEC)) - bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); - - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, bs->bio_pool); - } else { - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } -} - -void bio_init(struct bio *bio) -{ - memset(bio, 0, sizeof(*bio)); - bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); -} -EXPORT_SYMBOL(bio_init); - -/** - * bio_reset - reinitialize a bio - * @bio: bio to reset - * - * Description: - * After calling bio_reset(), @bio will be in the same state as a freshly - * allocated bio returned bio bio_alloc_bioset() - the only fields that are - * preserved are the ones that are initialized by bio_alloc_bioset(). See - * comment in struct bio. - */ -void bio_reset(struct bio *bio) -{ - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - - __bio_free(bio); - - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); -} -EXPORT_SYMBOL(bio_reset); - -static void bio_chain_endio(struct bio *bio, int error) -{ - bio_endio(bio->bi_private, error); - bio_put(bio); -} - -/** - * bio_chain - chain bio completions - * - * The caller won't have a bi_end_io called when @bio completes - instead, - * @parent's bi_end_io won't be called until both @parent and @bio have - * completed; the chained bio will also be freed when it completes. - * - * The caller must not set bi_private or bi_end_io in @bio. - */ -void bio_chain(struct bio *bio, struct bio *parent) -{ - BUG_ON(bio->bi_private || bio->bi_end_io); - - bio->bi_private = parent; - bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); -} -EXPORT_SYMBOL(bio_chain); - -static void bio_alloc_rescue(struct work_struct *work) -{ - struct bio_set *bs = container_of(work, struct bio_set, rescue_work); - struct bio *bio; - - while (1) { - spin_lock(&bs->rescue_lock); - bio = bio_list_pop(&bs->rescue_list); - spin_unlock(&bs->rescue_lock); - - if (!bio) - break; - - generic_make_request(bio); - } -} - -static void punt_bios_to_rescuer(struct bio_set *bs) -{ - struct bio_list punt, nopunt; - struct bio *bio; - - /* - * In order to guarantee forward progress we must punt only bios that - * were allocated from this bio_set; otherwise, if there was a bio on - * there for a stacking driver higher up in the stack, processing it - * could require allocating bios from this bio_set, and doing that from - * our own rescuer would be bad. - * - * Since bio lists are singly linked, pop them all instead of trying to - * remove from the middle of the list: - */ - - bio_list_init(&punt); - bio_list_init(&nopunt); - - while ((bio = bio_list_pop(current->bio_list))) - bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); - - *current->bio_list = nopunt; - - spin_lock(&bs->rescue_lock); - bio_list_merge(&bs->rescue_list, &punt); - spin_unlock(&bs->rescue_lock); - - queue_work(bs->rescue_workqueue, &bs->rescue_work); -} - -/** - * bio_alloc_bioset - allocate a bio for I/O - * @gfp_mask: the GFP_ mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * @bs: the bio_set to allocate from. - * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. - * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. - * - * Note that when running under generic_make_request() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent - * stack overflows. - * - * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. - * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. - * - * RETURNS: - * Pointer to new bio on success, NULL on failure. - */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) -{ - gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - unsigned long idx = BIO_POOL_NONE; - struct bio_vec *bvl = NULL; - struct bio *bio; - void *p; - - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* - * generic_make_request() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. - */ - - if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; - - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(bs->bio_pool, gfp_mask); - } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; - } - - if (unlikely(!p)) - return NULL; - - bio = p + front_pad; - bio_init(bio); - - if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); - } - - if (unlikely(!bvl)) - goto err_free; - - bio->bi_flags |= 1 << BIO_OWNS_VEC; - } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; - } - - bio->bi_pool = bs; - bio->bi_flags |= idx << BIO_POOL_OFFSET; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; - return bio; - -err_free: - mempool_free(p, bs->bio_pool); - return NULL; -} -EXPORT_SYMBOL(bio_alloc_bioset); - -void zero_fill_bio(struct bio *bio) -{ - unsigned long flags; - struct bio_vec bv; - struct bvec_iter iter; - - bio_for_each_segment(bv, bio, iter) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } -} -EXPORT_SYMBOL(zero_fill_bio); - -/** - * bio_put - release a reference to a bio - * @bio: bio to release reference to - * - * Description: - * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. - **/ -void bio_put(struct bio *bio) -{ - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) - bio_free(bio); -} -EXPORT_SYMBOL(bio_put); - -inline int bio_phys_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_phys_segments; -} -EXPORT_SYMBOL(bio_phys_segments); - -/** - * __bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: destination bio - * @bio_src: bio to clone - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. - * - * Caller must ensure that @bio_src is not freed before @bio. - */ -void __bio_clone_fast(struct bio *bio, struct bio *bio_src) -{ - BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE); - - /* - * most users will be overriding ->bi_bdev with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_flags |= 1 << BIO_CLONED; - bio->bi_rw = bio_src->bi_rw; - bio->bi_iter = bio_src->bi_iter; - bio->bi_io_vec = bio_src->bi_io_vec; -} -EXPORT_SYMBOL(__bio_clone_fast); - -/** - * bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Like __bio_clone_fast, only also allocates the returned bio - */ -struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) -{ - struct bio *b; - - b = bio_alloc_bioset(gfp_mask, 0, bs); - if (!b) - return NULL; - - __bio_clone_fast(b, bio); - - if (bio_integrity(bio)) { - int ret; - - ret = bio_integrity_clone(b, bio, gfp_mask); - - if (ret < 0) { - bio_put(b); - return NULL; - } - } - - return b; -} -EXPORT_SYMBOL(bio_clone_fast); - -/** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. - */ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) - return NULL; - - bio->bi_bdev = bio_src->bi_bdev; - bio->bi_rw = bio_src->bi_rw; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - if (bio->bi_rw & REQ_DISCARD) - goto integrity_clone; - - if (bio->bi_rw & REQ_WRITE_SAME) { - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - goto integrity_clone; - } - - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - -integrity_clone: - if (bio_integrity(bio_src)) { - int ret; - - ret = bio_integrity_clone(bio, bio_src, gfp_mask); - if (ret < 0) { - bio_put(bio); - return NULL; - } - } - - return bio; -} -EXPORT_SYMBOL(bio_clone_bioset); - -/** - * bio_get_nr_vecs - return approx number of vecs - * @bdev: I/O target - * - * Return the approximate number of pages we can send to this target. - * There's no guarantee that you will be able to fit this number of pages - * into a bio, it does not account for dynamic restrictions that vary - * on offset. - */ -int bio_get_nr_vecs(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = min_t(unsigned, - queue_max_segments(q), - queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); - - return min_t(unsigned, nr_pages, BIO_MAX_PAGES); - -} -EXPORT_SYMBOL(bio_get_nr_vecs); - -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned int max_sectors) -{ - int retried_segments = 0; - struct bio_vec *bvec; - - /* - * cloned bio must not modify vec list - */ - if (unlikely(bio_flagged(bio, BIO_CLONED))) - return 0; - - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) - return 0; - - /* - * For filesystems with a blocksize smaller than the pagesize - * we will often be called with the same page as last time and - * a consecutive offset. Optimize this special case. - */ - if (bio->bi_vcnt > 0) { - struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page == prev->bv_page && - offset == prev->bv_offset + prev->bv_len) { - unsigned int prev_bv_len = prev->bv_len; - prev->bv_len += len; - - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec - as new bvec. */ - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - - prev_bv_len, - .bi_rw = bio->bi_rw, - }; - - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { - prev->bv_len -= len; - return 0; - } - } - - goto done; - } - } - - if (bio->bi_vcnt >= bio->bi_max_vecs) - return 0; - - /* - * we might lose a segment or two here, but rather that than - * make this too complex. - */ - - while (bio->bi_phys_segments >= queue_max_segments(q)) { - - if (retried_segments) - return 0; - - retried_segments = 1; - blk_recount_segments(q, bio); - } - - /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* - * if queue has other restrictions (eg varying max sector size - * depending on offset), it can specify a merge_bvec_fn in the - * queue to get further control - */ - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size, - .bi_rw = bio->bi_rw, - }; - - /* - * merge_bvec_fn() returns number of bytes it can accept - * at this offset - */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } - } - - /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) - bio->bi_flags &= ~(1 << BIO_SEG_VALID); - - bio->bi_vcnt++; - bio->bi_phys_segments++; - done: - bio->bi_iter.bi_size += len; - return len; -} - -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by REQ_PC bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return __bio_add_page(q, bio, page, len, offset, - queue_max_hw_sectors(q)); -} -EXPORT_SYMBOL(bio_add_pc_page); - -/** - * bio_add_page - attempt to add page to bio - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - */ -int bio_add_page(struct bio *bio, struct page *page, unsigned int len, - unsigned int offset) -{ - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); -} -EXPORT_SYMBOL(bio_add_page); - -struct submit_bio_ret { - struct completion event; - int error; -}; - -static void submit_bio_wait_endio(struct bio *bio, int error) -{ - struct submit_bio_ret *ret = bio->bi_private; - - ret->error = error; - complete(&ret->event); -} - -/** - * submit_bio_wait - submit a bio, and wait until it completes - * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) - * @bio: The &struct bio which describes the I/O - * - * Simple wrapper around submit_bio(). Returns 0 on success, or the error from - * bio_endio() on failure. - */ -int submit_bio_wait(int rw, struct bio *bio) -{ - struct submit_bio_ret ret; - - rw |= REQ_SYNC; - init_completion(&ret.event); - bio->bi_private = &ret; - bio->bi_end_io = submit_bio_wait_endio; - submit_bio(rw, bio); - wait_for_completion(&ret.event); - - return ret.error; -} -EXPORT_SYMBOL(submit_bio_wait); - -/** - * bio_advance - increment/complete a bio by some number of bytes - * @bio: bio to advance - * @bytes: number of bytes to complete - * - * This updates bi_sector, bi_size and bi_idx; if the number of bytes to - * complete doesn't align with a bvec boundary, then bv_len and bv_offset will - * be updated on the last bvec as well. - * - * @bio will then represent the remaining, uncompleted portion of the io. - */ -void bio_advance(struct bio *bio, unsigned bytes) -{ - if (bio_integrity(bio)) - bio_integrity_advance(bio, bytes); - - bio_advance_iter(bio, &bio->bi_iter, bytes); -} -EXPORT_SYMBOL(bio_advance); - -/** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - -/** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. - * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). - */ -void bio_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter, dst_iter; - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - - src_iter = src->bi_iter; - dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - src_bv = bio_iter_iovec(src, src_iter); - dst_bv = bio_iter_iovec(dst, dst_iter); - - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - bio_advance_iter(src, &src_iter, bytes); - bio_advance_iter(dst, &dst_iter, bytes); - } -} -EXPORT_SYMBOL(bio_copy_data); - -struct bio_map_data { - int nr_sgvecs; - int is_our_pages; - struct sg_iovec sgvecs[]; -}; - -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - const struct sg_iovec *iov, int iov_count, - int is_our_pages) -{ - memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); - bmd->nr_sgvecs = iov_count; - bmd->is_our_pages = is_our_pages; - bio->bi_private = bmd; -} - -static struct bio_map_data *bio_alloc_map_data(int nr_segs, - unsigned int iov_count, - gfp_t gfp_mask) -{ - if (iov_count > UIO_MAXIOV) - return NULL; - - return kmalloc(sizeof(struct bio_map_data) + - sizeof(struct sg_iovec) * iov_count, gfp_mask); -} - -static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, - int to_user, int from_user, int do_free_page) -{ - int ret = 0, i; - struct bio_vec *bvec; - int iov_idx = 0; - unsigned int iov_off = 0; - - bio_for_each_segment_all(bvec, bio, i) { - char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; - - while (bv_len && iov_idx < iov_count) { - unsigned int bytes; - char __user *iov_addr; - - bytes = min_t(unsigned int, - iov[iov_idx].iov_len - iov_off, bv_len); - iov_addr = iov[iov_idx].iov_base + iov_off; - - if (!ret) { - if (to_user) - ret = copy_to_user(iov_addr, bv_addr, - bytes); - - if (from_user) - ret = copy_from_user(bv_addr, iov_addr, - bytes); - - if (ret) - ret = -EFAULT; - } - - bv_len -= bytes; - bv_addr += bytes; - iov_addr += bytes; - iov_off += bytes; - - if (iov[iov_idx].iov_len == iov_off) { - iov_idx++; - iov_off = 0; - } - } - - if (do_free_page) - __free_page(bvec->bv_page); - } - - return ret; -} - -/** - * bio_uncopy_user - finish previously mapped bio - * @bio: bio being terminated - * - * Free pages allocated from bio_copy_user() and write back data - * to user space in case of a read. - */ -int bio_uncopy_user(struct bio *bio) -{ - struct bio_map_data *bmd = bio->bi_private; - struct bio_vec *bvec; - int ret = 0, i; - - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { - /* - * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free. - */ - if (current->mm) - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, - bio_data_dir(bio) == READ, - 0, bmd->is_our_pages); - else if (bmd->is_our_pages) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - } - kfree(bmd); - bio_put(bio); - return ret; -} -EXPORT_SYMBOL(bio_uncopy_user); - -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - struct bio_map_data *bmd; - struct bio_vec *bvec; - struct page *page; - struct bio *bio; - int i, ret; - int nr_pages = 0; - unsigned int len = 0; - unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr; - unsigned long end; - unsigned long start; - - uaddr = (unsigned long)iov[i].iov_base; - end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = uaddr >> PAGE_SHIFT; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages += end - start; - len += iov[i].iov_len; - } - - if (offset) - nr_pages++; - - bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - if (!write_to_vm) - bio->bi_rw |= REQ_WRITE; - - ret = 0; - - if (map_data) { - nr_pages = 1 << map_data->page_order; - i = map_data->offset / PAGE_SIZE; - } - while (len) { - unsigned int bytes = PAGE_SIZE; - - bytes -= offset; - - if (bytes > len) - bytes = len; - - if (map_data) { - if (i == map_data->nr_entries * nr_pages) { - ret = -ENOMEM; - break; - } - - page = map_data->pages[i / nr_pages]; - page += (i % nr_pages); - - i++; - } else { - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; - } - } - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) - break; - - len -= bytes; - offset = 0; - } - - if (ret) - goto cleanup; - - /* - * success - */ - if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { - ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); - if (ret) - goto cleanup; - } - - bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); - return bio; -cleanup: - if (!map_data) - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - - bio_put(bio); -out_bmd: - kfree(bmd); - return ERR_PTR(ret); -} - -/** - * bio_copy_user - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. - */ -struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, - unsigned long uaddr, unsigned int len, - int write_to_vm, gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_copy_user); - -static struct bio *__bio_map_user_iov(struct request_queue *q, - struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - int i, j; - int nr_pages = 0; - struct page **pages; - struct bio *bio; - int cur_page = 0; - int ret, offset; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; - - /* - * Overflow, abort - */ - if (end < start) - return ERR_PTR(-EINVAL); - - nr_pages += end - start; - /* - * buffer must be aligned to at least hardsector size for now - */ - if (uaddr & queue_dma_alignment(q)) - return ERR_PTR(-EINVAL); - } - - if (!nr_pages) - return ERR_PTR(-EINVAL); - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); - if (!pages) - goto out; - - for (i = 0; i < iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; - const int local_nr_pages = end - start; - const int page_limit = cur_page + local_nr_pages; - - ret = get_user_pages_fast(uaddr, local_nr_pages, - write_to_vm, &pages[cur_page]); - if (ret < local_nr_pages) { - ret = -EFAULT; - goto out_unmap; - } - - offset = uaddr & ~PAGE_MASK; - for (j = cur_page; j < page_limit; j++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - /* - * sorry... - */ - if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < - bytes) - break; - - len -= bytes; - offset = 0; - } - - cur_page = j; - /* - * release the pages we didn't map into the bio, if any - */ - while (j < page_limit) - page_cache_release(pages[j++]); - } - - kfree(pages); - - /* - * set data direction, and check if mapped pages need bouncing - */ - if (!write_to_vm) - bio->bi_rw |= REQ_WRITE; - - bio->bi_bdev = bdev; - bio->bi_flags |= (1 << BIO_USER_MAPPED); - return bio; - - out_unmap: - for (i = 0; i < nr_pages; i++) { - if(!pages[i]) - break; - page_cache_release(pages[i]); - } - out: - kfree(pages); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_map_user - map user address into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @uaddr: start of user address - * @len: length in bytes - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm, - gfp_t gfp_mask) -{ - struct sg_iovec iov; - - iov.iov_base = (void __user *)uaddr; - iov.iov_len = len; - - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); -} -EXPORT_SYMBOL(bio_map_user); - -/** - * bio_map_user_iov - map user sg_iovec table into bio - * @q: the struct request_queue for the bio - * @bdev: destination block device - * @iov: the iovec. - * @iov_count: number of elements in the iovec - * @write_to_vm: bool indicating writing to pages or not - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - const struct sg_iovec *iov, int iov_count, - int write_to_vm, gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, - gfp_mask); - if (IS_ERR(bio)) - return bio; - - /* - * subtle -- if __bio_map_user() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - - return bio; -} - -static void __bio_unmap_user(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - /* - * make sure we dirty pages we wrote to - */ - bio_for_each_segment_all(bvec, bio, i) { - if (bio_data_dir(bio) == READ) - set_page_dirty_lock(bvec->bv_page); - - page_cache_release(bvec->bv_page); - } - - bio_put(bio); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user(). Must be called with - * a process context. - * - * bio_unmap_user() may sleep. - */ -void bio_unmap_user(struct bio *bio) -{ - __bio_unmap_user(bio); - bio_put(bio); -} -EXPORT_SYMBOL(bio_unmap_user); - -static void bio_map_kern_endio(struct bio *bio, int err) -{ - bio_put(bio); -} - -static struct bio *__bio_map_kern(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) -{ - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; - struct bio *bio; - - bio = bio_kmalloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) - break; - - data += bytes; - len -= bytes; - offset = 0; - } - - bio->bi_end_io = bio_map_kern_endio; - return bio; -} - -/** - * bio_map_kern - map kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to map - * @len: length in bytes - * @gfp_mask: allocation flags for bio allocation - * - * Map the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask) -{ - struct bio *bio; - - bio = __bio_map_kern(q, data, len, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (bio->bi_iter.bi_size == len) - return bio; - - /* - * Don't support partial mappings. - */ - bio_put(bio); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL(bio_map_kern); - -static void bio_copy_kern_endio(struct bio *bio, int err) -{ - struct bio_vec *bvec; - const int read = bio_data_dir(bio) == READ; - struct bio_map_data *bmd = bio->bi_private; - int i; - char *p = bmd->sgvecs[0].iov_base; - - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - if (read) - memcpy(p, addr, bvec->bv_len); - - __free_page(bvec->bv_page); - p += bvec->bv_len; - } - - kfree(bmd); - bio_put(bio); -} - -/** - * bio_copy_kern - copy kernel address into bio - * @q: the struct request_queue for the bio - * @data: pointer to buffer to copy - * @len: length in bytes - * @gfp_mask: allocation flags for bio and page allocation - * @reading: data direction is READ - * - * copy the kernel address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. - */ -struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, - gfp_t gfp_mask, int reading) -{ - struct bio *bio; - struct bio_vec *bvec; - int i; - - bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); - if (IS_ERR(bio)) - return bio; - - if (!reading) { - void *p = data; - - bio_for_each_segment_all(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - memcpy(addr, p, bvec->bv_len); - p += bvec->bv_len; - } - } - - bio->bi_end_io = bio_copy_kern_endio; - - return bio; -} -EXPORT_SYMBOL(bio_copy_kern); - -/* - * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions - * for performing direct-IO in BIOs. - * - * The problem is that we cannot run set_page_dirty() from interrupt context - * because the required locks are not interrupt-safe. So what we can do is to - * mark the pages dirty _before_ performing IO. And in interrupt context, - * check that the pages are still dirty. If so, fine. If not, redirty them - * in process context. - * - * We special-case compound pages here: normally this means reads into hugetlb - * pages. The logic in here doesn't really work right for compound pages - * because the VM does not uniformly chase down the head page in all cases. - * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't - * handle them at all. So we skip compound pages here at an early stage. - * - * Note that this code is very hard to test under normal circumstances because - * direct-io pins the pages with get_user_pages(). This makes - * is_page_cache_freeable return false, and the VM will not clean the pages. - * But other code (eg, flusher threads) could clean the pages if they are mapped - * pagecache. - * - * Simply disabling the call to bio_set_pages_dirty() is a good way to test the - * deferred bio dirtying paths. - */ - -/* - * bio_set_pages_dirty() will mark all the bio's pages as dirty. - */ -void bio_set_pages_dirty(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page && !PageCompound(page)) - set_page_dirty_lock(page); - } -} - -static void bio_release_pages(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page) - put_page(page); - } -} - -/* - * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. - * If they are, then fine. If, however, some pages are clean then they must - * have been written out during the direct-IO read. So we take another ref on - * the BIO and the offending pages and re-dirty the pages in process context. - * - * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one page_cache_release() against each page and will - * run one bio_put() against the BIO. - */ - -static void bio_dirty_fn(struct work_struct *work); - -static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); -static DEFINE_SPINLOCK(bio_dirty_lock); -static struct bio *bio_dirty_list; - -/* - * This runs in process context - */ -static void bio_dirty_fn(struct work_struct *work) -{ - unsigned long flags; - struct bio *bio; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio = bio_dirty_list; - bio_dirty_list = NULL; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - - while (bio) { - struct bio *next = bio->bi_private; - - bio_set_pages_dirty(bio); - bio_release_pages(bio); - bio_put(bio); - bio = next; - } -} - -void bio_check_pages_dirty(struct bio *bio) -{ - struct bio_vec *bvec; - int nr_clean_pages = 0; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (PageDirty(page) || PageCompound(page)) { - page_cache_release(page); - bvec->bv_page = NULL; - } else { - nr_clean_pages++; - } - } - - if (nr_clean_pages) { - unsigned long flags; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio->bi_private = bio_dirty_list; - bio_dirty_list = bio; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } else { - bio_put(bio); - } -} - -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -void bio_flush_dcache_pages(struct bio *bi) -{ - struct bio_vec bvec; - struct bvec_iter iter; - - bio_for_each_segment(bvec, bi, iter) - flush_dcache_page(bvec.bv_page); -} -EXPORT_SYMBOL(bio_flush_dcache_pages); -#endif - -/** - * bio_endio - end I/O on a bio - * @bio: bio - * @error: error, if any - * - * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. - **/ -void bio_endio(struct bio *bio, int error) -{ - while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; - - /* - * Need to have a real endio function for chained bios, - * otherwise various corner cases will break (like stacking - * block devices that save/restore bi_end_io) - however, we want - * to avoid unbounded recursion and blowing the stack. Tail call - * optimization would handle this, but compiling with frame - * pointers also disables gcc's sibling call optimization. - */ - if (bio->bi_end_io == bio_chain_endio) { - struct bio *parent = bio->bi_private; - bio_put(bio); - bio = parent; - } else { - if (bio->bi_end_io) - bio->bi_end_io(bio, error); - bio = NULL; - } - } -} -EXPORT_SYMBOL(bio_endio); - -/** - * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining - * @bio: bio - * @error: error, if any - * - * For code that has saved and restored bi_end_io; thing hard before using this - * function, probably you should've cloned the entire bio. - **/ -void bio_endio_nodec(struct bio *bio, int error) -{ - atomic_inc(&bio->bi_remaining); - bio_endio(bio, error); -} -EXPORT_SYMBOL(bio_endio_nodec); - -/** - * bio_split - split a bio - * @bio: bio to split - * @sectors: number of sectors to split from the front of @bio - * @gfp: gfp mask - * @bs: bio set to allocate from - * - * Allocates and returns a new bio which represents @sectors from the start of - * @bio, and updates @bio to represent the remaining sectors. - * - * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's - * responsibility to ensure that @bio is not freed before the split. - */ -struct bio *bio_split(struct bio *bio, int sectors, - gfp_t gfp, struct bio_set *bs) -{ - struct bio *split = NULL; - - BUG_ON(sectors <= 0); - BUG_ON(sectors >= bio_sectors(bio)); - - split = bio_clone_fast(bio, gfp, bs); - if (!split) - return NULL; - - split->bi_iter.bi_size = sectors << 9; - - if (bio_integrity(split)) - bio_integrity_trim(split, 0, sectors); - - bio_advance(bio, split->bi_iter.bi_size); - - return split; -} -EXPORT_SYMBOL(bio_split); - -/** - * bio_trim - trim a bio - * @bio: bio to trim - * @offset: number of sectors to trim from the front of @bio - * @size: size we want to trim @bio to, in sectors - */ -void bio_trim(struct bio *bio, int offset, int size) -{ - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. - */ - - size <<= 9; - if (offset == 0 && size == bio->bi_iter.bi_size) - return; - - clear_bit(BIO_SEG_VALID, &bio->bi_flags); - - bio_advance(bio, offset << 9); - - bio->bi_iter.bi_size = size; -} -EXPORT_SYMBOL_GPL(bio_trim); - -/* - * create memory pools for biovec's in a bio_set. - * use the global biovec slabs created for general use. - */ -mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) -{ - struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - - return mempool_create_slab_pool(pool_entries, bp->slab); -} - -void bioset_free(struct bio_set *bs) -{ - if (bs->rescue_workqueue) - destroy_workqueue(bs->rescue_workqueue); - - if (bs->bio_pool) - mempool_destroy(bs->bio_pool); - - if (bs->bvec_pool) - mempool_destroy(bs->bvec_pool); - - bioset_integrity_free(bs); - bio_put_slab(bs); - - kfree(bs); -} -EXPORT_SYMBOL(bioset_free); - -/** - * bioset_create - Create a bio_set - * @pool_size: Number of bio and bio_vecs to cache in the mempool - * @front_pad: Number of bytes to allocate in front of the returned bio - * - * Description: - * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller - * to ask for a number of bytes to be allocated in front of the bio. - * Front pad allocation is useful for embedding the bio inside - * another structure, to avoid allocating extra data to go with the bio. - * Note that the bio must be embedded at the END of that structure always, - * or things will break badly. - */ -struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) -{ - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - struct bio_set *bs; - - bs = kzalloc(sizeof(*bs), GFP_KERNEL); - if (!bs) - return NULL; - - bs->front_pad = front_pad; - - spin_lock_init(&bs->rescue_lock); - bio_list_init(&bs->rescue_list); - INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); - if (!bs->bio_slab) { - kfree(bs); - return NULL; - } - - bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab); - if (!bs->bio_pool) - goto bad; - - bs->bvec_pool = biovec_create_pool(bs, pool_size); - if (!bs->bvec_pool) - goto bad; - - bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); - if (!bs->rescue_workqueue) - goto bad; - - return bs; -bad: - bioset_free(bs); - return NULL; -} -EXPORT_SYMBOL(bioset_create); - -#ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_current - associate a bio with %current - * @bio: target bio - * - * Associate @bio with %current if it hasn't been associated yet. Block - * layer will treat @bio as if it were issued by %current no matter which - * task actually issues it. - * - * This function takes an extra reference of @task's io_context and blkcg - * which will be put when @bio is released. The caller must own @bio, - * ensure %current->io_context exists, and is responsible for synchronizing - * calls to this function. - */ -int bio_associate_current(struct bio *bio) -{ - struct io_context *ioc; - struct cgroup_subsys_state *css; - - if (bio->bi_ioc) - return -EBUSY; - - ioc = current->io_context; - if (!ioc) - return -ENOENT; - - /* acquire active ref on @ioc and associate */ - get_io_context_active(ioc); - bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_cgrp_id); - if (css && css_tryget(css)) - bio->bi_css = css; - rcu_read_unlock(); - - return 0; -} - -/** - * bio_disassociate_task - undo bio_associate_current() - * @bio: target bio - */ -void bio_disassociate_task(struct bio *bio) -{ - if (bio->bi_ioc) { - put_io_context(bio->bi_ioc); - bio->bi_ioc = NULL; - } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } -} - -#endif /* CONFIG_BLK_CGROUP */ - -static void __init biovec_init_slabs(void) -{ - int i; - - for (i = 0; i < BIOVEC_NR_POOLS; i++) { - int size; - struct biovec_slab *bvs = bvec_slabs + i; - - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } - - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } -} - -static int __init init_bio(void) -{ - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL); - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - - bio_integrity_init(); - biovec_init_slabs(); - - fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); - if (!fs_bio_set) - panic("bio: can't allocate bios\n"); - - if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) - panic("bio: can't create integrity pool\n"); - - return 0; -} -subsys_initcall(init_bio); diff --git a/fs/block_dev.c b/fs/block_dev.c index 552a8d13bc32..6d7274619bf9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -165,14 +165,15 @@ blkdev_get_block(struct inode *inode, sector_t iblock, } static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, - nr_segs, blkdev_get_block, NULL, NULL, 0); + return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter, + offset, blkdev_get_block, + NULL, NULL, 0); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -363,6 +364,69 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(blkdev_fsync); +/** + * bdev_read_page() - Start reading a page from a block device + * @bdev: The device to read the page from + * @sector: The offset on the device to read the page to (need not be aligned) + * @page: The page to read + * + * On entry, the page should be locked. It will be unlocked when the page + * has been read. If the block driver implements rw_page synchronously, + * that will be true on exit from this function, but it need not be. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to read this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_read_page(struct block_device *bdev, sector_t sector, + struct page *page) +{ + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); +} +EXPORT_SYMBOL_GPL(bdev_read_page); + +/** + * bdev_write_page() - Start writing a page to a block device + * @bdev: The device to write the page to + * @sector: The offset on the device to write the page to (need not be aligned) + * @page: The page to write + * @wbc: The writeback_control for the write + * + * On entry, the page should be locked and not currently under writeback. + * On exit, if the write started successfully, the page will be unlocked and + * under writeback. If the write failed already (eg the driver failed to + * queue the page to the device), the page will still be locked. If the + * caller is a ->writepage implementation, it will need to unlock the page. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to write this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_write_page(struct block_device *bdev, sector_t sector, + struct page *page, struct writeback_control *wbc) +{ + int result; + int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + set_page_writeback(page); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + if (result) + end_page_writeback(page); + else + unlock_page(page); + return result; +} +EXPORT_SYMBOL_GPL(bdev_write_page); + /* * pseudo-fs */ @@ -1508,43 +1572,38 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) * Does not take i_mutex for the write and thus is not for general purpose * use. */ -ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct blk_plug plug; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs); + ret = __generic_file_write_iter(iocb, from); if (ret > 0) { ssize_t err; - - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL_GPL(blkdev_aio_write); +EXPORT_SYMBOL_GPL(blkdev_write_iter); -static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = file->f_mapping->host; loff_t size = i_size_read(bd_inode); + loff_t pos = iocb->ki_pos; if (pos >= size) return 0; size -= pos; - if (size < iocb->ki_nbytes) - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); - return generic_file_aio_read(iocb, iov, nr_segs, pos); + iov_iter_truncate(to, size); + return generic_file_read_iter(iocb, to); } /* @@ -1576,10 +1635,10 @@ const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = blkdev_aio_read, - .aio_write = blkdev_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = blkdev_read_iter, + .write_iter = blkdev_write_iter, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, @@ -1587,7 +1646,7 @@ const struct file_operations def_blk_fops = { .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index f341a98031d2..6d1d0b93b1aa 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ - tests/extent-io-tests.o tests/inode-tests.o + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index ff9b3995d453..9a0124a95851 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, const char *name; char *value = NULL; - if (acl) { - ret = posix_acl_valid(acl); - if (ret < 0) - return ret; - ret = 0; - } - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 10db21fa0926..e25564bfcb46 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -900,7 +900,11 @@ again: goto out; BUG_ON(ret == 0); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (trans && likely(trans->type != __TRANS_DUMMY)) { +#else if (trans) { +#endif /* * look if there are updates for this ref queued and lock the * head @@ -984,11 +988,12 @@ again: goto out; } if (ref->count && ref->parent) { - if (extent_item_pos && !ref->inode_list) { + if (extent_item_pos && !ref->inode_list && + ref->level == 0) { u32 bsz; struct extent_buffer *eb; bsz = btrfs_level_size(fs_info->extent_root, - info_level); + ref->level); eb = read_tree_block(fs_info->extent_root, ref->parent, bsz, 0); if (!eb || !extent_buffer_uptodate(eb)) { @@ -1404,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, * returns <0 on error */ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - struct btrfs_extent_inline_ref **out_eiref, - int *out_type) + struct btrfs_key *key, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) { unsigned long end; u64 flags; @@ -1416,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, /* first call */ flags = btrfs_extent_flags(eb, ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - info = (struct btrfs_tree_block_info *)(ei + 1); - *out_eiref = - (struct btrfs_extent_inline_ref *)(info + 1); + if (key->type == BTRFS_METADATA_ITEM_KEY) { + /* a skinny metadata extent */ + *out_eiref = + (struct btrfs_extent_inline_ref *)(ei + 1); + } else { + WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY); + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } } else { *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); } *ptr = (unsigned long)*out_eiref; - if ((void *)*ptr >= (void *)ei + item_size) + if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size) return -ENOENT; } end = (unsigned long)ei + item_size; - *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); *ptr += btrfs_extent_inline_ref_size(*out_type); @@ -1447,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, * <0 on error. */ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level) + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level) { int ret; int type; @@ -1459,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 1; while (1) { - ret = __get_extent_inline_ref(ptr, eb, ei, item_size, - &eiref, &type); + ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, + &eiref, &type); if (ret < 0) return ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a910b27a8ad9..86fc20fec282 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, u64 *flags); int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level); + struct btrfs_key *key, struct btrfs_extent_item *ei, + u32 item_size, u64 *out_root, u8 *out_level); int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, @@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c9a24444ec9a..4794923c410c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0e8388e72d8d..ce92ae30250f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error: next_stack = btrfsic_stack_frame_alloc(); if (NULL == next_stack) { + sf->error = -1; btrfsic_release_block_ctx( &sf-> next_block_ctx); @@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame: sf->next_block_ctx.datav[0]; next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) + if (NULL == next_stack) { + sf->error = -1; goto one_stack_frame_backwards; + } next_stack->i = -1; next_stack->block = sf->next_block; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d43c544d3b68..1daea0b47187 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -821,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace) spin_lock(workspace_lock); if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); + list_add(workspace, idle_workspace); (*num_workspace)++; spin_unlock(workspace_lock); goto wake; @@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -1; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, workspace = find_workspace(type); if (IS_ERR(workspace)) - return -ENOMEM; + return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1bcfcdb23cf4..aeab453b8e24 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) static void add_root_to_dirty_list(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - if (root->track_dirty && list_empty(&root->dirty_list)) { + if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && + list_empty(&root->dirty_list)) { list_add(&root->dirty_list, &root->fs_info->dirty_cowonly_roots); } @@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int level; struct btrfs_disk_key disk_key; - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); if (level == 0) @@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) } /* - * Increment the upper half of tree_mod_seq, set lower half zero. - * - * Must be called with fs_info->tree_mod_seq_lock held. - */ -static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) -{ - u64 seq = atomic64_read(&fs_info->tree_mod_seq); - seq &= 0xffffffff00000000ull; - seq += 1ull << 32; - atomic64_set(&fs_info->tree_mod_seq, seq); - return seq; -} - -/* - * Increment the lower half of tree_mod_seq. - * - * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers - * are generated should not technically require a spin lock here. (Rationale: - * incrementing the minor while incrementing the major seq number is between its - * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it - * just returns a unique sequence number as usual.) We have decided to leave - * that requirement in here and rethink it once we notice it really imposes a - * problem on some workload. + * Pull a new tree mod seq number for our operation. */ -static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) { return atomic64_inc_return(&fs_info->tree_mod_seq); } /* - * return the last minor in the previous major tree_mod_seq number - */ -u64 btrfs_tree_mod_seq_prev(u64 seq) -{ - return (seq & 0xffffffff00000000ull) - 1ull; -} - -/* * This adds a new blocker to the tree mod log's blocker list if the @elem * passed does not already have a sequence number set. So when a caller expects * to record tree modifications, it should ensure to set elem->seq to zero @@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - u64 seq; - tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); + elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); tree_mod_log_write_unlock(fs_info); - return seq; + return elem->seq; } void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, @@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) BUG_ON(!tm); - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq(fs_info); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; @@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. */ - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) return 1; #endif @@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->fs_info->running_transaction->transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) return ret; @@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif /* ensure we can see the force_cow */ smp_rmb(); @@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !root->force_cow) + !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; return 1; } @@ -5125,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) return ret; btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); - if (ret < 0) + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) return 0; return 1; } @@ -5736,6 +5718,24 @@ again: ret = 0; goto done; } + /* + * So the above check misses one case: + * - after releasing the path above, someone has removed the item that + * used to be at the very end of the block, and balance between leafs + * gets another one with bigger key.offset to replace it. + * + * This one should be returned as well, or we can get leaf corruption + * later(esp. in __btrfs_drop_extents()). + * + * And a bit more explanation about this check, + * with ret > 0, the key isn't found, the path points to the slot + * where it should be inserted, so the path->slots[0] item must be the + * bigger one. + */ + if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { + ret = 0; + goto done; + } while (level < BTRFS_MAX_LEVEL) { if (!path->nodes[level]) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4c48df572bd6..be91397f4e92 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/workqueue.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -756,6 +757,12 @@ struct btrfs_dir_item { #define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -840,7 +847,10 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - __le64 unused[8]; + /* BTRFS_BALANCE_ARGS_LIMIT value */ + __le64 limit; + + __le64 unused[7]; } __attribute__ ((__packed__)); /* @@ -1113,6 +1123,12 @@ struct btrfs_qgroup_limit_item { __le64 rsv_excl; } __attribute__ ((__packed__)); +/* For raid type sysfs entries */ +struct raid_kobject { + int raid_type; + struct kobject kobj; +}; + struct btrfs_space_info { spinlock_t lock; @@ -1163,7 +1179,7 @@ struct btrfs_space_info { wait_queue_head_t wait; struct kobject kobj; - struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES]; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 @@ -1243,11 +1259,19 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 delalloc_bytes; u64 bytes_super; u64 flags; u64 sectorsize; u64 cache_generation; + /* + * It is just used for the delayed data space allocation because + * only the data space allocation and the relative metadata update + * can be done cross the transaction. + */ + struct rw_semaphore data_rwsem; + /* for raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; @@ -1313,6 +1337,8 @@ struct btrfs_stripe_hash_table { #define BTRFS_STRIPE_HASH_TABLE_BITS 11 +void btrfs_init_async_reclaim_work(struct work_struct *work); + /* fs_info */ struct reloc_control; struct btrfs_device; @@ -1534,6 +1560,9 @@ struct btrfs_fs_info { */ struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; + + /* the extent workers do delayed refs on the extent allocation tree */ + struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; @@ -1636,7 +1665,10 @@ struct btrfs_fs_info { /* holds configuration and tracking. Protected by qgroup_lock */ struct rb_root qgroup_tree; + struct rb_root qgroup_op_tree; spinlock_t qgroup_lock; + spinlock_t qgroup_op_lock; + atomic_t qgroup_op_seq; /* * used to avoid frequently calling ulist_alloc()/ulist_free() @@ -1688,6 +1720,9 @@ struct btrfs_fs_info { struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; }; struct btrfs_subvolume_writers { @@ -1696,6 +1731,26 @@ struct btrfs_subvolume_writers { }; /* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP 0 +#define BTRFS_ROOT_REF_COWS 1 +#define BTRFS_ROOT_TRACK_DIRTY 2 +#define BTRFS_ROOT_IN_RADIX 3 +#define BTRFS_ROOT_DUMMY_ROOT 4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 +#define BTRFS_ROOT_DEFRAG_RUNNING 6 +#define BTRFS_ROOT_FORCE_COW 7 +#define BTRFS_ROOT_MULTI_LOG_TASKS 8 + +/* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ @@ -1706,6 +1761,7 @@ struct btrfs_root { struct btrfs_root *log_root; struct btrfs_root *reloc_root; + unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -1740,7 +1796,6 @@ struct btrfs_root { /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; - bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -1760,23 +1815,13 @@ struct btrfs_root { u64 highest_objectid; - /* btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So in_trans_setup - * is used to tell us when more checks are required - */ - unsigned long in_trans_setup; - int ref_cows; - int track_dirty; - int in_radix; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - int dummy_root; + u64 alloc_bytenr; #endif + u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; - int defrag_running; char *name; /* the dirty list is only used by non-reference counted roots */ @@ -1790,7 +1835,6 @@ struct btrfs_root { spinlock_t orphan_lock; atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; - int orphan_item_inserted; int orphan_cleanup_state; spinlock_t inode_lock; @@ -1808,8 +1852,6 @@ struct btrfs_root { */ dev_t anon_dev; - int force_cow; - spinlock_t root_item_lock; atomic_t refs; @@ -2058,6 +2100,20 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Inode flags */ @@ -2774,6 +2830,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } +static inline bool btrfs_root_dead(struct btrfs_root *root) +{ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); @@ -2883,6 +2944,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); } static inline void @@ -2900,6 +2962,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); } /* struct btrfs_super_block */ @@ -3222,6 +3285,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -3259,11 +3324,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data); + struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); + struct extent_buffer *buf, int full_backref, int no_quota); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, @@ -3271,9 +3336,10 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow); + u64 owner, u64 offset, int no_quota); -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, + int delalloc); int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, @@ -3283,7 +3349,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow); + u64 root_objectid, u64 owner, u64 offset, int no_quota); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3371,7 +3437,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); - int btrfs_start_nocow_write(struct btrfs_root *root); void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ @@ -3547,7 +3612,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); -u64 btrfs_tree_mod_seq_prev(u64 seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ @@ -3694,6 +3758,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); + /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; @@ -4055,52 +4125,6 @@ void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, u64 start, int err); -/* qgroup.c */ -struct qgroup_update { - struct list_head list; - struct btrfs_delayed_ref_node *node; - struct btrfs_delayed_extent_op *extent_op; -}; - -int btrfs_quota_enable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_quota_disable(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); -void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 src, u64 dst); -int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); -int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid); -int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - struct btrfs_qgroup_limit *limit); -int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); -void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); -struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op); -int btrfs_run_qgroups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, - struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -4117,6 +4141,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); #endif #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 33e561a84013..da775bfdebc9 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -149,8 +149,8 @@ again: spin_lock(&root->inode_lock); ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); if (ret == -EEXIST) { - kmem_cache_free(delayed_node_cache, node); spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); radix_tree_preload_end(); goto again; } @@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node( mutex_unlock(&delayed_node->mutex); if (atomic_dec_and_test(&delayed_node->refs)) { + bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); if (atomic_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); - kmem_cache_free(delayed_node_cache, delayed_node); + free = true; } spin_unlock(&root->inode_lock); + if (free) + kmem_cache_free(delayed_node_cache, delayed_node); } } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 31299646024d..6d16bea94e1c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, return -1; if (ref1->type > ref2->type) return 1; + if (ref1->no_quota > ref2->no_quota) + return 1; + if (ref1->no_quota < ref2->no_quota) + return -1; /* merging of sequenced refs is not allowed */ if (compare_seq) { if (ref1->seq < ref2->seq) @@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int for_cow) + int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; @@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ @@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int for_cow) + u64 offset, int action, int no_quota) { struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; @@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; + if (is_fstree(ref_root)) + seq = atomic64_read(&fs_info->tree_mod_seq); + /* first set the basic ref node struct up */ atomic_set(&ref->refs, 1); ref->bytenr = bytenr; @@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem); + ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, level, action, - for_cow); + no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } @@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow) + int no_quota) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + no_quota = 0; + BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, for_cow); + action, no_quota); spin_unlock(&delayed_refs->lock); - if (need_ref_seq(for_cow, ref_root)) - btrfs_qgroup_record_ref(trans, &ref->node, extent_op); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 4ba9b93022ff..a764e2340d48 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, struct btrfs_delayed_extent_op *extent_op, - int for_cow); + int no_quota); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - -/* * a node might live in a head or a regular ref, this lets you * test for the proper type to use. */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9f2290509aca..eea26e1b2fda 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -36,6 +36,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "sysfs.h" static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); @@ -313,7 +314,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (btrfs_fs_incompat(fs_info, RAID56)) { btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); - return -EINVAL; + return -EOPNOTSUPP; } switch (args->start.cont_reading_from_srcdev_mode) { @@ -562,6 +563,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 029d46c2e170..08e65e9cf2aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@ #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" +#include "qgroup.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -368,7 +369,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); - btrfs_tree_read_unlock_blocking(eb); + if (need_lock) + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -1109,6 +1111,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return alloc_test_extent_buffer(root->fs_info, bytenr, + blocksize); +#endif return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1201,10 +1208,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->nodesize = nodesize; root->leafsize = leafsize; root->stripesize = stripesize; - root->ref_cows = 0; - root->track_dirty = 0; - root->in_radix = 0; - root->orphan_item_inserted = 0; + root->state = 0; root->orphan_cleanup_state = 0; root->objectid = objectid; @@ -1265,7 +1269,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, else root->defrag_trans_start = 0; init_completion(&root->kobj_unregister); - root->defrag_running = 0; root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1290,7 +1293,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void) if (!root) return ERR_PTR(-ENOMEM); __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); - root->dummy_root = 1; + set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + root->alloc_bytenr = 0; return root; } @@ -1341,8 +1345,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); root->commit_root = btrfs_root_node(root); - root->track_dirty = 1; - + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); root->root_item.flags = 0; root->root_item.byte_limit = 0; @@ -1371,6 +1374,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, fail: if (leaf) { btrfs_tree_unlock(leaf); + free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } kfree(root); @@ -1396,13 +1400,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + /* + * DON'T set REF_COWS for log trees + * * log trees do not get reference counted because they go away * before a real commit is actually done. They do store pointers * to file data extents, and those reference counts still get * updated (along with back refs to the log tree). */ - root->ref_cows = 0; leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, BTRFS_TREE_LOG_OBJECTID, NULL, @@ -1536,7 +1542,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, return root; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - root->ref_cows = 1; + set_bit(BTRFS_ROOT_REF_COWS, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1606,7 +1612,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, (unsigned long)root->root_key.objectid, root); if (ret == 0) - root->in_radix = 1; + set_bit(BTRFS_ROOT_IN_RADIX, &root->state); spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); @@ -1662,7 +1668,7 @@ again: if (ret < 0) goto fail; if (ret == 0) - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { @@ -2064,6 +2070,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->extent_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2090,7 +2097,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->chunk_root); } -static void del_fs_roots(struct btrfs_fs_info *fs_info) +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; @@ -2101,7 +2108,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root, root_list); list_del(&gang[0]->root_list); - if (gang[0]->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { btrfs_drop_and_free_fs_root(fs_info, gang[0]); } else { free_extent_buffer(gang[0]->node); @@ -2221,6 +2228,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); @@ -2246,6 +2254,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); + atomic_set(&fs_info->qgroup_op_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; @@ -2291,6 +2300,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -2354,6 +2364,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; @@ -2577,6 +2588,10 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("readahead", flags, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->submit_workers && fs_info->flush_workers && @@ -2586,6 +2601,7 @@ int open_ctree(struct super_block *sb, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->fixup_workers && fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; @@ -2693,7 +2709,7 @@ retry_root_backup: ret = PTR_ERR(extent_root); goto recovery_tree_root; } - extent_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); fs_info->extent_root = extent_root; location.objectid = BTRFS_DEV_TREE_OBJECTID; @@ -2702,7 +2718,7 @@ retry_root_backup: ret = PTR_ERR(dev_root); goto recovery_tree_root; } - dev_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); fs_info->dev_root = dev_root; btrfs_init_devices_late(fs_info); @@ -2712,13 +2728,13 @@ retry_root_backup: ret = PTR_ERR(csum_root); goto recovery_tree_root; } - csum_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); fs_info->csum_root = csum_root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; quota_root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(quota_root)) { - quota_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, "a_root->state); fs_info->quota_enabled = 1; fs_info->pending_quota_state = 1; fs_info->quota_root = quota_root; @@ -2733,7 +2749,7 @@ retry_root_backup: create_uuid_tree = true; check_uuid_tree = false; } else { - uuid_root->track_dirty = 1; + set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); fs_info->uuid_root = uuid_root; create_uuid_tree = false; check_uuid_tree = @@ -2861,7 +2877,7 @@ retry_root_backup: printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); @@ -2870,26 +2886,28 @@ retry_root_backup: "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) { ret = btrfs_commit_super(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; } } ret = btrfs_find_orphan_roots(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); @@ -2966,7 +2984,7 @@ fail_qgroup: fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); btrfs_cleanup_transaction(fs_info->tree_root); - del_fs_roots(fs_info); + btrfs_free_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -3501,8 +3519,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_free_log(NULL, root); - __btrfs_remove_free_space_cache(root->free_ino_pinned); - __btrfs_remove_free_space_cache(root->free_ino_ctl); + if (root->free_ino_pinned) + __btrfs_remove_free_space_cache(root->free_ino_pinned); + if (root->free_ino_ctl) + __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); } @@ -3533,28 +3553,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; - int i; - int ret; + int i = 0; + int err = 0; + unsigned int ret = 0; + int index; while (1) { + index = srcu_read_lock(&fs_info->subvol_srcu); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); - if (!ret) + if (!ret) { + srcu_read_unlock(&fs_info->subvol_srcu, index); break; - + } root_objectid = gang[ret - 1]->root_key.objectid + 1; + for (i = 0; i < ret; i++) { - int err; + /* Avoid to grab roots in dead_roots */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* grab all the search result for later use */ + gang[i] = btrfs_grab_fs_root(gang[i]); + } + srcu_read_unlock(&fs_info->subvol_srcu, index); + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) - return err; + break; + btrfs_put_fs_root(gang[i]); } root_objectid++; } - return 0; + + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_fs_root(gang[i]); + } + return err; } int btrfs_commit_super(struct btrfs_root *root) @@ -3603,6 +3646,8 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + cancel_work_sync(&fs_info->async_reclaim_work); + if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) @@ -3627,12 +3672,17 @@ int close_ctree(struct btrfs_root *root) btrfs_sysfs_remove_one(fs_info); - del_fs_roots(fs_info); + btrfs_free_fs_roots(fs_info); btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); + /* + * we must make sure there is not any read request to + * submit after we stopping all workers. + */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); free_root_pointers(fs_info, 1); @@ -3709,6 +3759,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, buf->len, root->fs_info->dirty_metadata_batch); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + btrfs_print_leaf(root, buf); + ASSERT(0); + } +#endif } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 53059df350f8..23ce3ceba0a9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1306487c82cf..813537f362f9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,16 +26,16 @@ #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include "hash.h" -#include "ctree.h" +#include "tree-log.h" #include "disk-io.h" #include "print-tree.h" -#include "transaction.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" #include "sysfs.h" +#include "qgroup.h" #undef SCRAMBLE_DELAYED_REFS @@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); + struct btrfs_delayed_extent_op *extra_op, + int no_quota); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); + int level, struct btrfs_key *ins, + int no_quota); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -103,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 num_bytes, int reserve, + int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -1271,7 +1274,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop) + int refs_to_drop, int *last_ref) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -1307,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); + *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -1542,6 +1546,7 @@ again: ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -1763,7 +1768,8 @@ void update_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int *last_ref) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -1807,6 +1813,7 @@ void update_inline_extent_backref(struct btrfs_root *root, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { + *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1838,7 +1845,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); update_inline_extent_backref(root, path, iref, - refs_to_add, extent_op); + refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { setup_inline_extent_backref(root, path, iref, parent, root_objectid, owner, offset, @@ -1871,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) + int refs_to_drop, int is_data, int *last_ref) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) { update_inline_extent_backref(root, path, iref, - -refs_to_drop, NULL); + -refs_to_drop, NULL, last_ref); } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + ret = remove_extent_data_ref(trans, root, path, refs_to_drop, + last_ref); } else { + *last_ref = 1; ret = btrfs_del_item(trans, root, path); } return ret; @@ -1945,7 +1954,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) + u64 root_objectid, u64 owner, u64 offset, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1957,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); + BTRFS_ADD_DELAYED_REF, NULL, no_quota); } return ret; } @@ -1972,31 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, + int no_quota, struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; + struct btrfs_key key; u64 refs; int ret; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) + no_quota = 1; + path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, + ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, + bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if (ret != -EAGAIN) + if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) goto out; + /* + * Ok we were able to insert an inline extent and it appears to be a new + * reference, deal with the qgroup accounting. + */ + if (!ret && !no_quota) { + ASSERT(root->fs_info->quota_enabled); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) + type = BTRFS_QGROUP_OPER_ADD_SHARED; + btrfs_release_path(path); + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + goto out; + } + + /* + * Ok we had -EAGAIN which means we didn't have space to insert and + * inline extent ref, so just update the reference count and add a + * normal backref. + */ leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); + if (refs) + type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2004,9 +2047,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + bytenr, num_bytes, type, 0); + if (ret) + goto out; + } + path->reada = 1; path->leave_spinning = 1; - /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, path, bytenr, parent, root_objectid, @@ -2040,8 +2089,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { if (extent_op) @@ -2055,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + node->no_quota, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op); + extent_op, node->no_quota); } else { BUG(); } @@ -2198,8 +2246,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; - else - ref_root = ref->root; + ref_root = ref->root; ins.objectid = node->bytenr; if (skinny_metadata) { @@ -2217,15 +2264,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins); + ref->level, &ins, + node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, node->no_quota, + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, 1, extent_op, + node->no_quota); } else { BUG(); } @@ -2573,42 +2623,6 @@ static u64 find_middle(struct rb_root *root) } #endif -int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct qgroup_update *qgroup_update; - int ret = 0; - - if (list_empty(&trans->qgroup_ref_list) != - !trans->delayed_ref_elem.seq) { - /* list without seq or seq without list */ - btrfs_err(fs_info, - "qgroup accounting update error, list is%s empty, seq is %#x.%x", - list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); - } - - if (!trans->delayed_ref_elem.seq) - return 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - qgroup_update = list_first_entry(&trans->qgroup_ref_list, - struct qgroup_update, list); - list_del(&qgroup_update->list); - if (!ret) - ret = btrfs_qgroup_account_ref( - trans, fs_info, qgroup_update->node, - qgroup_update->extent_op); - kfree(qgroup_update); - } - - btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem); - - return ret; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2661,15 +2675,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, u64 num_entries = atomic_read(&trans->transaction->delayed_refs.num_entries); u64 avg_runtime; + u64 val; smp_mb(); avg_runtime = fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; if (num_entries * avg_runtime >= NSEC_PER_SEC) return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; return btrfs_check_space_for_delayed_refs(trans, root); } +struct async_delayed_refs { + struct btrfs_root *root; + int count; + int error; + int sync; + struct completion wait; + struct btrfs_work work; +}; + +static void delayed_ref_async_start(struct btrfs_work *work) +{ + struct async_delayed_refs *async; + struct btrfs_trans_handle *trans; + int ret; + + async = container_of(work, struct async_delayed_refs, work); + + trans = btrfs_join_transaction(async->root); + if (IS_ERR(trans)) { + async->error = PTR_ERR(trans); + goto done; + } + + /* + * trans->sync means that when we call end_transaciton, we won't + * wait on delayed refs + */ + trans->sync = true; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); + if (ret) + async->error = ret; + + ret = btrfs_end_transaction(trans, async->root); + if (ret && !async->error) + async->error = ret; +done: + if (async->sync) + complete(&async->wait); + else + kfree(async); +} + +int btrfs_async_run_delayed_refs(struct btrfs_root *root, + unsigned long count, int wait) +{ + struct async_delayed_refs *async; + int ret; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->root = root->fs_info->tree_root; + async->count = count; + async->error = 0; + if (wait) + async->sync = 1; + else + async->sync = 0; + init_completion(&async->wait); + + btrfs_init_work(&async->work, delayed_ref_async_start, + NULL, NULL); + + btrfs_queue_work(root->fs_info->extent_workers, &async->work); + + if (wait) { + wait_for_completion(&async->wait); + ret = async->error; + kfree(async); + return ret; + } + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -2697,8 +2790,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - delayed_refs = &trans->transaction->delayed_refs; if (count == 0) { count = atomic_read(&delayed_refs->num_entries) * 2; @@ -2757,6 +2848,9 @@ again: goto again; } out: + ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); + if (ret) + return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2963,7 +3057,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int for_cow) + int full_backref, int inc, int no_quota) { u64 bytenr; u64 num_bytes; @@ -2978,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!root->ref_cows && level == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) return 0; if (inc) @@ -3013,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, for_cow); + key.offset, no_quota); if (ret) goto fail; } else { @@ -3021,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - for_cow); + no_quota); if (ret) goto fail; } @@ -3032,15 +3130,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) + struct extent_buffer *buf, int full_backref, int no_quota) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -3163,7 +3261,8 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root, SPACE_CACHE) || + block_group->delalloc_bytes) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3400,10 +3499,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&found->block_groups[i]); - kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); - } init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -3542,11 +3639,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -4201,6 +4300,104 @@ static int flush_space(struct btrfs_root *root, return ret; } + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, + struct btrfs_space_info *space_info) +{ + u64 used; + u64 expected; + u64 to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, + 16 * 1024 * 1024); + spin_lock(&space_info->lock); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) { + to_reclaim = 0; + goto out; + } + + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (can_overcommit(root, space_info, 1024 * 1024, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); +out: + spin_unlock(&space_info->lock); + + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info, u64 used) +{ + return (used >= div_factor_fine(space_info->total_bytes, 98) && + !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, + struct btrfs_fs_info *fs_info) +{ + u64 used; + + spin_lock(&space_info->lock); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (need_do_async_reclaim(space_info, fs_info, used)) { + spin_unlock(&space_info->lock); + return 1; + } + spin_unlock(&space_info->lock); + + return 0; +} + +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) + return; + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + flush_state++; + if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + return; + } while (flush_state <= COMMIT_TRANS); + + if (btrfs_need_do_async_reclaim(space_info, fs_info)) + queue_work(system_unbound_wq, work); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -4308,8 +4505,13 @@ again: if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + if (need_do_async_reclaim(space_info, root->fs_info, used) && + !work_busy(&root->fs_info->async_reclaim_work)) + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); } - spin_unlock(&space_info->lock); if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) @@ -4366,7 +4568,7 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) block_rsv = trans->block_rsv; if (root == root->fs_info->csum_root && trans->adding_csums) @@ -5413,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @reserve: One of the reservation enums + * @delalloc: The blocks are allocated for the delalloc write * * This is called by the allocator when it reserves space, or by somebody who is * freeing space that was never actually used on disk. For example if you @@ -5431,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, * succeeds. */ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) + u64 num_bytes, int reserve, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; @@ -5450,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, num_bytes, 0); space_info->bytes_may_use -= num_bytes; } + + if (delalloc) + cache->delalloc_bytes += num_bytes; } } else { if (cache->ro) space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -5469,7 +5678,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; - struct btrfs_space_info *space_info; down_write(&fs_info->commit_root_sem); @@ -5492,9 +5700,6 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, up_write(&fs_info->commit_root_sem); - list_for_each_entry_rcu(space_info, &fs_info->space_info, list) - percpu_counter_set(&space_info->total_bytes_pinned, 0); - update_global_block_rsv(fs_info); } @@ -5532,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -5618,7 +5824,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_delayed_extent_op *extent_op, + int no_quota) { struct btrfs_key key; struct btrfs_path *path; @@ -5634,9 +5841,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + int last_ref = 0; + enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); + if (!info->quota_enabled || !is_fstree(root_objectid)) + no_quota = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5684,7 +5896,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5719,6 +5931,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -5802,7 +6015,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { btrfs_err(info, "trying to drop %d refs but we only have %Lu " - "for bytenr %Lu\n", refs_to_drop, refs, bytenr); + "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5810,6 +6023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { + type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -5825,7 +6039,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, - is_data); + is_data, &last_ref); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5846,6 +6060,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -5868,6 +6083,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } + btrfs_release_path(path); + + /* Deal with the quota accounting */ + if (!ret && last_ref && !no_quota) { + int mod_seq = 0; + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && + type == BTRFS_QGROUP_OPER_SUB_SHARED) + mod_seq = 1; + + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, + bytenr, num_bytes, type, + mod_seq); + } out: btrfs_free_path(path); return ret; @@ -5983,7 +6212,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } @@ -6004,11 +6233,15 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) + u64 owner, u64 offset, int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -6024,13 +6257,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); + BTRFS_DROP_DELAYED_REF, NULL, no_quota); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); + NULL, no_quota); } return ret; } @@ -6138,6 +6371,70 @@ enum btrfs_loop_type { LOOP_NO_EMPTY_SIZE = 3, }; +static inline void +btrfs_lock_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + down_read(&cache->data_rwsem); +} + +static inline void +btrfs_grab_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + btrfs_get_block_group(cache); + if (delalloc) + down_read(&cache->data_rwsem); +} + +static struct btrfs_block_group_cache * +btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + int delalloc) +{ + struct btrfs_block_group_cache *used_bg; + bool locked = false; +again: + spin_lock(&cluster->refill_lock); + if (locked) { + if (used_bg == cluster->block_group) + return used_bg; + + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } + + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) + return used_bg; + + btrfs_get_block_group(used_bg); + + if (!delalloc) + return used_bg; + + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; + + spin_unlock(&cluster->refill_lock); + down_read(&used_bg->data_rwsem); + locked = true; + goto again; +} + +static inline void +btrfs_release_block_group(struct btrfs_block_group_cache *cache, + int delalloc) +{ + if (delalloc) + up_read(&cache->data_rwsem); + btrfs_put_block_group(cache); +} + /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: @@ -6152,7 +6449,7 @@ enum btrfs_loop_type { static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, - u64 flags) + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -6240,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, up_read(&space_info->groups_sem); } else { index = get_block_group_index(block_group); + btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } } else if (block_group) { @@ -6254,7 +6552,7 @@ search: u64 offset; int cached; - btrfs_get_block_group(block_group); + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; /* @@ -6302,16 +6600,16 @@ have_block_group: * the refill lock keeps out other * people trying to start a new cluster */ - spin_lock(&last_ptr->refill_lock); - used_block_group = last_ptr->block_group; - if (used_block_group != block_group && - (!used_block_group || - used_block_group->ro || - !block_group_bits(used_block_group, flags))) + used_block_group = btrfs_lock_cluster(block_group, + last_ptr, + delalloc); + if (!used_block_group) goto refill_cluster; - if (used_block_group != block_group) - btrfs_get_block_group(used_block_group); + if (used_block_group != block_group && + (used_block_group->ro || + !block_group_bits(used_block_group, flags))) + goto release_cluster; offset = btrfs_alloc_from_cluster(used_block_group, last_ptr, @@ -6325,16 +6623,15 @@ have_block_group: used_block_group, search_start, num_bytes); if (used_block_group != block_group) { - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, + delalloc); block_group = used_block_group; } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); -refill_cluster: +release_cluster: /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6351,8 +6648,10 @@ refill_cluster: * succeeding in the unclustered * allocation. */ if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { + used_block_group != block_group) { spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(used_block_group, + delalloc); goto unclustered_alloc; } @@ -6362,6 +6661,10 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); + if (used_block_group != block_group) + btrfs_release_block_group(used_block_group, + delalloc); +refill_cluster: if (loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); goto unclustered_alloc; @@ -6469,7 +6772,7 @@ checks: BUG_ON(offset > search_start); ret = btrfs_update_reserved_bytes(block_group, num_bytes, - alloc_type); + alloc_type, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -6481,13 +6784,13 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - btrfs_put_block_group(block_group); + btrfs_release_block_group(block_group, delalloc); } up_read(&space_info->groups_sem); @@ -6510,8 +6813,14 @@ loop: loop++; if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; @@ -6528,7 +6837,8 @@ loop: root, ret); else ret = 0; - btrfs_end_transaction(trans, root); + if (!exist) + btrfs_end_transaction(trans, root); if (ret) goto out; } @@ -6593,7 +6903,7 @@ again: int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data) + struct btrfs_key *ins, int is_data, int delalloc) { bool final_tried = false; u64 flags; @@ -6603,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags); + flags, delalloc); if (ret == -ENOSPC) { if (!final_tried && ins->offset) { @@ -6628,7 +6938,8 @@ again: } static int __btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len, int pin) + u64 start, u64 len, + int pin, int delalloc) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -6647,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, pin_down_extent(root, cache, start, len, 1); else { btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } btrfs_put_block_group(cache); @@ -6657,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, } int btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len) + u64 start, u64 len, int delalloc) { - return __btrfs_free_reserved_extent(root, start, len, 0); + return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); } int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, u64 start, u64 len) { - return __btrfs_free_reserved_extent(root, start, len, 1); + return __btrfs_free_reserved_extent(root, start, len, 1, 0); } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -6729,6 +7040,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + /* Always set parent to 0 here since its exclusive anyway. */ + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, ins->offset, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6743,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) + int level, struct btrfs_key *ins, + int no_quota) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6753,6 +7072,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); + u64 num_bytes = ins->offset; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6786,6 +7106,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + num_bytes = root->leafsize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -6807,6 +7128,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + if (!no_quota) { + ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, + ins->objectid, num_bytes, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) + return ret; + } + ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -6862,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, return -EINVAL; ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT); + RESERVE_ALLOC_NO_ACCOUNT, 0); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -6990,12 +7319,21 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, + blocksize, level); + if (!IS_ERR(buf)) + root->alloc_bytenr += blocksize; + return buf; + } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, - empty_size, hint, &ins, 0); + empty_size, hint, &ins, 0, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -7731,7 +8069,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } - if (root->in_radix) { + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { btrfs_drop_and_free_fs_root(tree_root->fs_info, root); } else { free_extent_buffer(root->node); @@ -8323,8 +8661,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; - kobj = &space_info->block_group_kobjs[i]; - if (kobj->parent) { + kobj = space_info->block_group_kobjs[i]; + space_info->block_group_kobjs[i] = NULL; + if (kobj) { kobject_del(kobj); kobject_put(kobj); } @@ -8348,17 +8687,26 @@ static void __link_block_group(struct btrfs_space_info *space_info, up_write(&space_info->groups_sem); if (first) { - struct kobject *kobj = &space_info->block_group_kobjs[index]; + struct raid_kobject *rkobj; int ret; - kobject_get(&space_info->kobj); /* put in release */ - ret = kobject_add(kobj, &space_info->kobj, "%s", - get_raid_name(index)); + rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) + goto out_err; + rkobj->raid_type = index; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); if (ret) { - pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); - kobject_put(&space_info->kobj); + kobject_put(&rkobj->kobj); + goto out_err; } + space_info->block_group_kobjs[index] = &rkobj->kobj; } + + return; +out_err: + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); } static struct btrfs_block_group_cache * @@ -8388,6 +8736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) start); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); + init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->new_bg_list); @@ -8607,7 +8956,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, extent_root = root->fs_info->extent_root; - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) @@ -8693,6 +9042,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; struct inode *inode; + struct kobject *kobj = NULL; int ret; int index; int factor; @@ -8792,11 +9142,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, */ list_del_init(&block_group->list); if (list_empty(&block_group->space_info->block_groups[index])) { - kobject_del(&block_group->space_info->block_group_kobjs[index]); - kobject_put(&block_group->space_info->block_group_kobjs[index]); + kobj = block_group->space_info->block_group_kobjs[index]; + block_group->space_info->block_group_kobjs[index] = NULL; clear_avail_alloc_bits(root->fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); + if (kobj) { + kobject_del(kobj); + kobject_put(kobj); + } if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3955e475ceec..a389820d158b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1693,6 +1693,7 @@ again: * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); + cached_state = NULL; if (!loops) { max_bytes = PAGE_CACHE_SIZE; loops = 1; @@ -2353,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); struct extent_io_tree *tree; - int ret; + int ret = 0; tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -2367,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) if (!uptodate) { ClearPageUptodate(page); SetPageError(page); + ret = ret < 0 ? ret : -EIO; + mapping_set_error(page->mapping, ret); } return 0; } @@ -3098,143 +3101,130 @@ static noinline void update_nr_written(struct page *page, } /* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. Then the lock bits are removed - * and the end_io handler clears the writeback ranges + * helper for __extent_writepage, doing all of the delayed allocation setup. + * + * This returns 1 if our fill_delalloc function did all the work required + * to write the page (copy into inline extent). In this case the IO has + * been started and the page is already unlocked. + * + * This returns 0 if all went well (page still locked) + * This returns < 0 if there were errors (page still locked) */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) +static noinline_for_stack int writepage_delalloc(struct inode *inode, + struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + u64 delalloc_start, + unsigned long *nr_written) +{ + struct extent_io_tree *tree = epd->tree; + u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1; + u64 nr_delalloc; + u64 delalloc_to_write = 0; + u64 delalloc_end = 0; + int ret; + int page_started = 0; + + if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) + return 0; + + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + nr_written); + /* File system has been set read-only */ + if (ret) { + SetPageError(page); + /* fill_delalloc should be return < 0 for error + * but just in case, we use > 0 here meaning the + * IO is started, so we don't want to return > 0 + * unless things are going well. + */ + ret = ret < 0 ? ret : -EIO; + goto done; + } + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + delalloc_start = delalloc_end + 1; + } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= *nr_written; + return 1; + } + + ret = 0; + +done: + return ret; +} + +/* + * helper for __extent_writepage. This calls the writepage start hooks, + * and does the loop to map the page into extents and bios. + * + * We return 1 if the IO is started and the page is unlocked, + * 0 if all went well (page still locked) + * < 0 if there were errors (page still locked) + */ +static noinline_for_stack int __extent_writepage_io(struct inode *inode, + struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd, + loff_t i_size, + unsigned long nr_written, + int write_flags, int *nr_ret) { - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); - u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; u64 extent_offset; - u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; - int ret; - int nr = 0; size_t pg_offset = 0; size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - int page_started; - int compressed; - int write_flags; - unsigned long nr_written = 0; - bool fill_delalloc = true; - - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; - else - write_flags = WRITE; - - trace___extent_writepage(page, inode, wbc); - - WARN_ON(!PageLocked(page)); - - ClearPageError(page); - - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage); - flush_dcache_page(page); - } - pg_offset = 0; - - set_page_extent_mapped(page); - - if (!tree->ops || !tree->ops->fill_delalloc) - fill_delalloc = false; - - delalloc_start = start; - delalloc_end = 0; - page_started = 0; - if (!epd->extent_locked && fill_delalloc) { - u64 delalloc_to_write = 0; - /* - * make sure the wbc mapping index is at least updated - * to this page. - */ - update_nr_written(page, wbc, 0); - - while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, - &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - &nr_written); - /* File system has been set read-only */ - if (ret) { - SetPageError(page); - goto done; - } - /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - delalloc_start = delalloc_end + 1; - } - if (wbc->nr_to_write < delalloc_to_write) { - int thresh = 8192; - - if (delalloc_to_write < thresh * 2) - thresh = delalloc_to_write; - wbc->nr_to_write = min_t(u64, delalloc_to_write, - thresh); - } + int ret = 0; + int nr = 0; + bool compressed; - /* did the fill delalloc function already unlock and start - * the IO? - */ - if (page_started) { - ret = 0; - /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. - */ - wbc->nr_to_write -= nr_written; - goto done_unlocked; - } - } if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); @@ -3244,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, wbc->pages_skipped++; else redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); - ret = 0; + ret = 1; goto done_unlocked; } } @@ -3258,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (last_byte <= start) { + if (i_size <= start) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -3268,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, blocksize = inode->i_sb->s_blocksize; while (cur <= end) { - if (cur >= last_byte) { + u64 em_end; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -3278,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); + ret = PTR_ERR_OR_ZERO(em); break; } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + em_end = extent_map_end(em); + BUG_ON(em_end <= cur); BUG_ON(end < cur); - iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; @@ -3320,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset += iosize; continue; } - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0, NULL)) { - cur = cur + iosize; - pg_offset += iosize; - continue; - } if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, @@ -3337,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret) { SetPageError(page); } else { - unsigned long max_nr = end_index + 1; + unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { @@ -3359,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, nr++; } done: + *nr_ret = nr; + +done_unlocked: + + /* drop our reference on any cached states */ + free_extent_state(cached_state); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + u64 start = page_offset(page); + u64 page_end = start + PAGE_CACHE_SIZE - 1; + int ret; + int nr = 0; + size_t pg_offset = 0; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + int write_flags; + unsigned long nr_written = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC; + else + write_flags = WRITE; + + trace___extent_writepage(page, inode, wbc); + + WARN_ON(!PageLocked(page)); + + ClearPageError(page); + + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage); + flush_dcache_page(page); + } + + pg_offset = 0; + + set_page_extent_mapped(page); + + ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + + ret = __extent_writepage_io(inode, page, wbc, epd, + i_size, nr_written, write_flags, &nr); + if (ret == 1) + goto done_unlocked; + +done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } + if (PageError(page)) { + ret = ret < 0 ? ret : -EIO; + end_extent_writepage(page, ret, start, page_end); + } unlock_page(page); + return ret; done_unlocked: - - /* drop our reference on any cached states */ - free_extent_state(cached_state); return 0; } @@ -3385,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct extent_page_data *epd) +static noinline_for_stack int +lock_extent_buffer_for_io(struct extent_buffer *eb, + struct btrfs_fs_info *fs_info, + struct extent_page_data *epd) { unsigned long i, num_pages; int flush = 0; @@ -3458,7 +3523,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, static void end_extent_buffer_writeback(struct extent_buffer *eb) { clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } @@ -3492,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) bio_put(bio); } -static int write_one_eb(struct extent_buffer *eb, +static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct btrfs_fs_info *fs_info, struct writeback_control *wbc, struct extent_page_data *epd) @@ -3690,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; + int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; @@ -3776,8 +3842,8 @@ retry: unlock_page(page); ret = 0; } - if (ret) - done = 1; + if (!err && ret < 0) + err = ret; /* * the filesystem may choose to bump up nr_to_write. @@ -3789,7 +3855,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!scanned && !done && !err) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -3799,7 +3865,7 @@ retry: goto retry; } btrfs_add_delayed_iput(inode); - return ret; + return err; } static void flush_epd_write_bio(struct extent_page_data *epd) @@ -4510,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb) +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) { unsigned long num_pages, i; @@ -4519,7 +4586,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - mark_page_accessed(p); + if (p != accessed) + mark_page_accessed(p); } } @@ -4533,7 +4601,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + mark_extent_buffer_accessed(eb, NULL); return eb; } rcu_read_unlock(); @@ -4541,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) +{ + struct extent_buffer *eb, *exists = NULL; + int ret; + + eb = find_extent_buffer(fs_info, start); + if (eb) + return eb; + eb = alloc_dummy_extent_buffer(start, len); + if (!eb) + return NULL; + eb->fs_info = fs_info; +again: + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&fs_info->buffer_lock); + ret = radix_tree_insert(&fs_info->buffer_radix, + start >> PAGE_CACHE_SHIFT, eb); + spin_unlock(&fs_info->buffer_lock); + radix_tree_preload_end(); + if (ret == -EEXIST) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + else + goto again; + } + check_buffer_tree_ref(eb); + set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); + + /* + * We will free dummy extent buffer's if they come into + * free_extent_buffer with a ref count of 2, but if we are using this we + * want the buffers to stay in memory until we're done with them, so + * bump the ref count again. + */ + atomic_inc(&eb->refs); + return eb; +free_eb: + btrfs_release_extent_buffer(eb); + return exists; +} +#endif + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { @@ -4581,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, spin_unlock(&mapping->private_lock); unlock_page(p); page_cache_release(p); - mark_extent_buffer_accessed(exists); + mark_extent_buffer_accessed(exists, p); goto free_eb; } @@ -4596,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); WARN_ON(PageDirty(p)); - mark_page_accessed(p); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -4954,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } } +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char __user *dst = (char __user *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = page_address(page); + if (copy_to_user(dst, kaddr + offset, cur)) { + ret = -EFAULT; + break; + } + + dst += cur; + len -= cur; + offset = 0; + i++; + } + + return ret; +} + int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c488b45237bf..ccc264e7bde1 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -158,7 +158,6 @@ struct extent_buffer { * to unlock */ wait_queue_head_t read_lock_wq; - wait_queue_head_t lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; @@ -304,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); +int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, + unsigned long start, + unsigned long len); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, @@ -350,5 +352,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); #endif #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1874aee69c86..225302b39afb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -75,6 +75,8 @@ void free_extent_map(struct extent_map *em) if (atomic_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); + if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) + kfree(em->bdev); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index e7fd8a56a140..b2991fd8583e 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -15,6 +15,7 @@ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ +#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 127555b29f58..f46cfe45d686 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, found: csum += count * csum_size; nblocks -= count; + bio_index += count; while (count--) { disk_bytenr += bvec->bv_len; offset += bvec->bv_len; - bio_index++; bvec++; } } @@ -750,7 +750,7 @@ again: int slot = path->slots[0] + 1; /* we didn't find a csum item, insert one */ nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems - 1) { + if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); if (ret == 1) found_next = 1; @@ -885,3 +885,79 @@ out: fail_unlock: goto out; } + +void btrfs_extent_item_to_extent_map(struct inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf = path->nodes[0]; + const int slot = path->slots[0]; + struct btrfs_key key; + u64 extent_start, extent_end; + u64 bytenr; + u8 type = btrfs_file_extent_type(leaf, fi); + int compress_type = btrfs_file_extent_compression(leaf, fi); + + em->bdev = root->fs_info->fs_devices->latest_bdev; + btrfs_item_key_to_cpu(leaf, &key, slot); + extent_start = key.offset; + + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, slot, fi); + extent_end = ALIGN(extent_start + size, root->sectorsize); + } + + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, fi); + em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + return; + } + if (compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + em->block_start = bytenr; + em->block_len = em->orig_block_len; + } else { + bytenr += btrfs_file_extent_offset(leaf, fi); + em->block_start = bytenr; + em->block_len = em->len; + if (type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + em->block_start = EXTENT_MAP_INLINE; + em->start = extent_start; + em->len = extent_end - extent_start; + /* + * Initialize orig_start and block_len with the same values + * as in inode.c:btrfs_get_extent(). + */ + em->orig_start = EXTENT_MAP_HOLE; + em->block_len = (u64)-1; + if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } + } else { + btrfs_err(root->fs_info, + "unknown file extent item type %d, inode %llu, offset %llu, root %llu", + type, btrfs_ino(inode), extent_start, + root->root_key.objectid); + } +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eb742c07e7a4..1f2b99cb55ea 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -40,6 +40,7 @@ #include "tree-log.h" #include "locking.h" #include "volumes.h" +#include "qgroup.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -447,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, write_bytes -= copied; total_copied += copied; - /* Return to btrfs_file_aio_write to fault page */ + /* Return to btrfs_file_write_iter to fault page */ if (unlikely(copied == 0)) break; @@ -470,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } @@ -714,7 +716,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int recow; int ret; int modify_tree = -1; - int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int update_refs; int found = 0; int leafs_visited = 0; @@ -724,6 +726,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -780,6 +784,18 @@ next_slot: extent_end = search_start; } + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. + */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + if (extent_end <= search_start) { path->slots[0]++; goto next_slot; @@ -800,7 +816,7 @@ next_slot: if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -835,7 +851,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset, 1); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -846,7 +862,7 @@ next_slot: */ if (start <= key.offset && end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -872,7 +888,7 @@ next_slot: if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -EINVAL; + ret = -EOPNOTSUPP; break; } @@ -893,6 +909,7 @@ next_slot: * | ------ extent ------ | */ if (start <= key.offset && end >= extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; @@ -1191,7 +1208,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset, 1); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1658,27 +1675,22 @@ again: } static ssize_t __btrfs_direct_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count, size_t ocount) + struct iov_iter *from, + loff_t pos) { struct file *file = iocb->ki_filp; - struct iov_iter i; ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - count, ocount); + written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) + if (written < 0 || !iov_iter_count(from)) return written; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = __btrfs_buffered_write(file, &i, pos); + written_buffered = __btrfs_buffered_write(file, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1713,9 +1725,8 @@ static void update_time_for_write(struct inode *inode) inode_inc_iversion(inode); } -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -1724,18 +1735,12 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; - size_t count, ocount; + size_t count = iov_iter_count(from); bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + loff_t pos = iocb->ki_pos; mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - count = ocount; - current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { @@ -1748,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } + iov_iter_truncate(from, count); + err = file_remove_suid(file); if (err) { mutex_unlock(&inode->i_mutex); @@ -1777,7 +1784,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { /* Expand hole size to cover write data, preventing empty gap */ - end_pos = round_up(pos + iov->iov_len, root->sectorsize); + end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); @@ -1789,14 +1796,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, count, ocount); + num_written = __btrfs_direct_write(iocb, from, pos); } else { - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, num_written); - - num_written = __btrfs_buffered_write(file, &i, pos); + num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) iocb->ki_pos = pos + num_written; } @@ -2009,8 +2011,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!full_sync) { ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) + if (ret) { + btrfs_end_transaction(trans, root); goto out; + } } ret = btrfs_commit_transaction(trans, root); } else { @@ -2168,6 +2172,37 @@ out: return 0; } +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2175,25 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); - u64 lockend = round_down(offset + len, - BTRFS_I(inode)->root->sectorsize) - 1; - u64 cur_offset = lockstart; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; int ret = 0; int err = 0; int rsv_count; - bool same_page = ((offset >> PAGE_CACHE_SHIFT) == - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); - u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + u64 ino_size; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) return ret; mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + /* * We needn't truncate any page which is beyond the end of the file * because we are sure there is no data there. @@ -2205,8 +2257,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (same_page && len < PAGE_CACHE_SIZE) { if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); - mutex_unlock(&inode->i_mutex); - return ret; + goto out_only_mutex; } /* zero back part of the first page */ @@ -2218,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } } - /* zero the front end of the last page */ - if (offset + len < ino_size) { - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + ret = btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } } } @@ -2249,9 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if ((!ordered || (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_UPTODATE, 0, - cached_state)) { + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -2299,6 +2375,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) BUG_ON(ret); trans->block_rsv = rsv; + cur_offset = lockstart; + len = lockend - cur_offset; while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, @@ -2339,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) rsv, min_size); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } } if (ret) { @@ -2347,7 +2433,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - if (cur_offset < ino_size) { + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { err = ret; @@ -2372,6 +2463,7 @@ out_free: out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); +out_only_mutex: mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; @@ -2633,11 +2725,11 @@ out: const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, - .aio_write = btrfs_file_aio_write, + .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 73f3de7a083c..2b0a627cb5f9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -274,18 +274,32 @@ struct io_ctl { }; static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, - struct btrfs_root *root) + struct btrfs_root *root, int write) { + int num_pages; + int check_crcs = 0; + + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + check_crcs = 1; + + /* Make sure we can fit our crcs into the first page */ + if (write && check_crcs && + (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) + return -ENOSPC; + memset(io_ctl, 0, sizeof(struct io_ctl)); - io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, - GFP_NOFS); + + io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; + + io_ctl->num_pages = num_pages; io_ctl->root = root; - if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) - io_ctl->check_crcs = 1; + io_ctl->check_crcs = check_crcs; + return 0; } @@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, generation = btrfs_free_space_generation(leaf, header); btrfs_release_path(path); + if (!BTRFS_I(inode)->generation) { + btrfs_info(root->fs_info, + "The free space cache file (%llu) is invalid. skip it\n", + offset); + return 0; + } + if (BTRFS_I(inode)->generation != generation) { btrfs_err(root->fs_info, "free space inode generation (%llu) " @@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!num_entries) return 0; - ret = io_ctl_init(&io_ctl, inode, root); + ret = io_ctl_init(&io_ctl, inode, root, 0); if (ret) return ret; @@ -831,7 +852,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, if (!matched) { __btrfs_remove_free_space_cache(ctl); - btrfs_err(fs_info, "block group %llu has wrong amount of free space", + btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->key.objectid); ret = -1; } @@ -843,7 +864,7 @@ out: spin_unlock(&block_group->lock); ret = 0; - btrfs_err(fs_info, "failed to load free space cache for block group %llu", + btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", block_group->key.objectid); } @@ -851,90 +872,44 @@ out: return ret; } -/** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle - * @path - the path to use - * @offset - the offset for the key we'll insert - * - * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. - */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) +static noinline_for_stack +int write_cache_extent_entries(struct io_ctl *io_ctl, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + int *entries, int *bitmaps, + struct list_head *bitmap_list) { - struct btrfs_free_space_header *header; - struct extent_buffer *leaf; - struct rb_node *node; - struct list_head *pos, *n; - struct extent_state *cached_state = NULL; - struct btrfs_free_cluster *cluster = NULL; - struct extent_io_tree *unpin = NULL; - struct io_ctl io_ctl; - struct list_head bitmap_list; - struct btrfs_key key; - u64 start, extent_start, extent_end, len; - int entries = 0; - int bitmaps = 0; int ret; - int err = -1; - - INIT_LIST_HEAD(&bitmap_list); - - if (!i_size_read(inode)) - return -1; - - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return -1; + struct btrfs_free_cluster *cluster = NULL; + struct rb_node *node = rb_first(&ctl->free_space_offset); /* Get the cluster for this block_group if it exists */ - if (block_group && !list_empty(&block_group->cluster_list)) + if (block_group && !list_empty(&block_group->cluster_list)) { cluster = list_entry(block_group->cluster_list.next, struct btrfs_free_cluster, block_group_list); + } - /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(&io_ctl, inode, 0); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); - - node = rb_first(&ctl->free_space_offset); if (!node && cluster) { node = rb_first(&cluster->root); cluster = NULL; } - /* Make sure we can fit our crcs into the first page */ - if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) - goto out_nospc; - - io_ctl_set_generation(&io_ctl, trans->transid); - /* Write out the extent entries */ while (node) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; + *entries += 1; - ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, e->bitmap); if (ret) - goto out_nospc; + goto fail; if (e->bitmap) { - list_add_tail(&e->list, &bitmap_list); - bitmaps++; + list_add_tail(&e->list, bitmap_list); + *bitmaps += 1; } node = rb_next(node); if (!node && cluster) { @@ -942,136 +917,289 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } } + return 0; +fail: + return -ENOSPC; +} + +static noinline_for_stack int +update_cache_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, u64 offset, + int entries, int bitmaps) +{ + struct btrfs_key key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = offset; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); + goto fail; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + ASSERT(path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != offset) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); + btrfs_release_path(path); + goto fail; + } + } + + BTRFS_I(inode)->generation = trans->transid; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + return 0; + +fail: + return -1; +} + +static noinline_for_stack int +write_pinned_extent_entries(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct io_ctl *io_ctl, + int *entries) +{ + u64 start, extent_start, extent_end, len; + struct extent_io_tree *unpin = NULL; + int ret; + + if (!block_group) + return 0; /* * We want to add any pinned extents to our free space cache * so we don't leak the space - */ - - /* + * * We shouldn't have switched the pinned extents yet so this is the * right one */ unpin = root->fs_info->pinned_extents; - if (block_group) - start = block_group->key.objectid; + start = block_group->key.objectid; - while (block_group && (start < block_group->key.objectid + - block_group->key.offset)) { + while (start < block_group->key.objectid + block_group->key.offset) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL); - if (ret) { - ret = 0; - break; - } + if (ret) + return 0; /* This pinned extent is out of our range */ if (extent_start >= block_group->key.objectid + block_group->key.offset) - break; + return 0; extent_start = max(extent_start, start); extent_end = min(block_group->key.objectid + block_group->key.offset, extent_end + 1); len = extent_end - extent_start; - entries++; - ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); + *entries += 1; + ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); if (ret) - goto out_nospc; + return -ENOSPC; start = extent_end; } + return 0; +} + +static noinline_for_stack int +write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + int ret; + /* Write out the bitmaps */ - list_for_each_safe(pos, n, &bitmap_list) { + list_for_each_safe(pos, n, bitmap_list) { struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) - goto out_nospc; + return -ENOSPC; list_del_init(&entry->list); } - /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); - - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, - 0, i_size_read(inode), &cached_state); - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + return 0; +} - if (ret) - goto out; +static int flush_dirty_cache(struct inode *inode) +{ + int ret; ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); - if (ret) { + if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, GFP_NOFS); - goto out; - } - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; + return ret; +} - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); - goto out; +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ + struct list_head *pos, *n; + + list_for_each_safe(pos, n, bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); } - leaf = path->nodes[0]; - if (ret > 0) { - struct btrfs_key found_key; - ASSERT(path->slots[0]); - path->slots[0]--; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || - found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); - btrfs_release_path(path); + io_ctl_drop_pages(io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, cached_state, + GFP_NOFS); +} + +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) +{ + struct extent_state *cached_state = NULL; + struct io_ctl io_ctl; + LIST_HEAD(bitmap_list); + int entries = 0; + int bitmaps = 0; + int ret; + + if (!i_size_read(inode)) + return -1; + + ret = io_ctl_init(&io_ctl, inode, root, 1); + if (ret) + return -1; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { + down_write(&block_group->data_rwsem); + spin_lock(&block_group->lock); + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + up_write(&block_group->data_rwsem); + BTRFS_I(inode)->generation = 0; + ret = 0; goto out; } + spin_unlock(&block_group->lock); } - BTRFS_I(inode)->generation = trans->transid; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - btrfs_set_free_space_entries(leaf, header, entries); - btrfs_set_free_space_bitmaps(leaf, header, bitmaps); - btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state); + + io_ctl_set_generation(&io_ctl, trans->transid); + + /* Write out the extent entries in the free space cache */ + ret = write_cache_extent_entries(&io_ctl, ctl, + block_group, &entries, &bitmaps, + &bitmap_list); + if (ret) + goto out_nospc; + + /* + * Some spaces that are freed in the current transaction are pinned, + * they will be added into free space cache after the transaction is + * committed, we shouldn't lose them. + */ + ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); + if (ret) + goto out_nospc; + + /* At last, we write out all the bitmaps. */ + ret = write_bitmap_entries(&io_ctl, &bitmap_list); + if (ret) + goto out_nospc; + + /* Zero out the rest of the pages just to make sure */ + io_ctl_zero_remaining_pages(&io_ctl); + + /* Everything is written out, now we dirty the pages in the file. */ + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + if (ret) + goto out_nospc; + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + /* + * Release the pages and unlock the extent, we will flush + * them out later + */ + io_ctl_drop_pages(&io_ctl); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; - err = 0; + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + entries, bitmaps); out: io_ctl_free(&io_ctl); - if (err) { + if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return err; + return ret; out_nospc: - list_for_each_safe(pos, n, &bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - list_del_init(&entry->list); - } - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); + cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + + if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + up_write(&block_group->data_rwsem); + goto out; } @@ -1091,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); return 0; } + + if (block_group->delalloc_bytes) { + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(root, block_group, path); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc8ca193d830..888fbe19079f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -174,9 +174,13 @@ static void start_caching(struct btrfs_root *root) BTRFS_LAST_FREE_OBJECTID - objectid + 1); } - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + } } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -205,24 +209,14 @@ again: void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - again: if (root->cached == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(ctl, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); } else { - /* - * If we are in the process of caching free ino chunks, - * to avoid adding the same inode number to the free_ino - * tree twice due to cross transaction, we'll leave it - * in the pinned tree until a transaction is committed - * or the caching work is done. - */ - down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { @@ -234,11 +228,7 @@ again: start_caching(root); - if (objectid <= root->cache_progress || - objectid >= root->highest_objectid) - __btrfs_add_free_space(ctl, objectid, 1); - else - __btrfs_add_free_space(pinned, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); up_write(&root->fs_info->commit_root_sem); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f805bc944fa..3668048e16f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, +static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, int extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, @@ -693,7 +693,7 @@ retry: ret = btrfs_reserve_extent(root, async_extent->compressed_size, async_extent->compressed_size, - 0, alloc_hint, &ins, 1); + 0, alloc_hint, &ins, 1, 1); if (ret) { int i; @@ -794,7 +794,7 @@ retry: out: return ret; out_free_reserve: - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + @@ -917,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, root->sectorsize, 0, alloc_hint, - &ins, 1); + &ins, 1, 1); if (ret < 0) goto out_unlock; @@ -995,7 +995,7 @@ out: return ret; out_reserve: - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | @@ -2599,6 +2599,21 @@ out_kfree: return NULL; } +static void btrfs_release_delalloc_bytes(struct btrfs_root *root, + u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, start); + ASSERT(cache); + + spin_lock(&cache->lock); + cache->delalloc_bytes -= len; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); +} + /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -2678,6 +2693,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out_unlock; } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2697,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); + if (!ret) + btrfs_release_delalloc_bytes(root, + ordered_extent->start, + ordered_extent->disk_len); } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len, @@ -2749,7 +2769,7 @@ out: !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(root, ordered_extent->start, - ordered_extent->disk_len); + ordered_extent->disk_len, 1); } @@ -2947,14 +2967,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, root->orphan_block_rsv = NULL; spin_unlock(&root->orphan_lock); - if (root->orphan_item_inserted && + if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) && btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) btrfs_abort_transaction(trans, root, ret); else - root->orphan_item_inserted = 0; + clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state); } if (block_rsv) { @@ -3271,7 +3292,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) btrfs_block_rsv_release(root, root->orphan_block_rsv, (u64)-1); - if (root->orphan_block_rsv || root->orphan_item_inserted) { + if (root->orphan_block_rsv || + test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans, root); @@ -3473,7 +3495,7 @@ cache_acl: ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(root->fs_info, - "error loading props for ino %llu (root %llu): %d\n", + "error loading props for ino %llu (root %llu): %d", btrfs_ino(inode), root->root_key.objectid, ret); } @@ -3998,7 +4020,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * not block aligned since we will be keeping the last block of the * extent just the way it is. */ - if (root->ref_cows || root == root->fs_info->tree_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, ALIGN(new_size, root->sectorsize), (u64)-1, 0); @@ -4091,7 +4114,9 @@ search_again: extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (root->ref_cows && extent_start != 0) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state) && + extent_start != 0) inode_sub_bytes(inode, num_dec); btrfs_mark_buffer_dirty(leaf); } else { @@ -4105,7 +4130,8 @@ search_again: num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - if (root->ref_cows) + if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) inode_sub_bytes(inode, num_dec); } } @@ -4120,10 +4146,9 @@ search_again: btrfs_file_extent_other_encoding(leaf, fi) == 0) { u32 size = new_size - found_key.offset; - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } /* * update the ram bytes to properly reflect @@ -4133,7 +4158,8 @@ search_again: size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(root, path, size, 1); - } else if (root->ref_cows) { + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); } @@ -4155,8 +4181,9 @@ delete: } else { break; } - if (found_extent && (root->ref_cows || - root == root->fs_info->tree_root)) { + if (found_extent && + (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, @@ -5168,8 +5195,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry) static void btrfs_dentry_release(struct dentry *dentry) { - if (dentry->d_fsdata) - kfree(dentry->d_fsdata); + kfree(dentry->d_fsdata); } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, @@ -5553,6 +5579,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; + int nitems = name ? 2 : 1; unsigned long ptr; int ret; @@ -5572,7 +5599,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ inode->i_ino = objectid; - if (dir) { + if (dir && name) { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(dir, index); @@ -5581,6 +5608,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, iput(inode); return ERR_PTR(ret); } + } else if (dir) { + *index = 0; } /* * index_cnt is ignored for everything but a dir, @@ -5605,21 +5634,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); key[0].offset = 0; - /* - * Start new inodes with an inode_ref. This is slightly more - * efficient for small numbers of hard links since they will - * be packed into one item. Extended refs will kick in if we - * add more hard links than can fit in the ref item. - */ - key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); - key[1].offset = ref_objectid; - sizes[0] = sizeof(struct btrfs_inode_item); - sizes[1] = name_len + sizeof(*ref); + + if (name) { + /* + * Start new inodes with an inode_ref. This is slightly more + * efficient for small numbers of hard links since they will + * be packed into one item. Extended refs will kick in if we + * add more hard links than can fit in the ref item. + */ + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[1] = name_len + sizeof(*ref); + } path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail; @@ -5632,12 +5664,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (name) { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); @@ -5673,7 +5707,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return inode; fail: - if (dir) + if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); iput(inode); @@ -5958,6 +5992,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, err = btrfs_update_inode(trans, root, inode); if (err) goto fail; + if (inode->i_nlink == 1) { + /* + * If new hard link count is 1, it's a file created + * with open(2) O_TMPFILE flag. + */ + err = btrfs_orphan_del(trans, inode); + if (err) + goto fail; + } d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -6086,16 +6129,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); - if (ret) { - char *kaddr = kmap_atomic(page); - unsigned long copy_size = min_t(u64, - PAGE_CACHE_SIZE - pg_offset, - max_size - extent_offset); - memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr); - } kfree(tmp); - return 0; + return ret; } /* @@ -6113,7 +6148,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, { int ret; int err = 0; - u64 bytenr; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6127,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compress_type; + const bool new_inline = !page || create; again: read_lock(&em_tree->lock); @@ -6201,7 +6235,6 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -6236,32 +6269,10 @@ next: goto not_found_em; } - em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); + btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em); + if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - em->start = extent_start; - em->len = extent_end - extent_start; - em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, item); - em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); - bytenr = btrfs_file_extent_disk_bytenr(leaf, item); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; - goto insert; - } - if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - em->block_start = bytenr; - em->block_len = em->orig_block_len; - } else { - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; - em->block_len = em->len; - if (found_type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; @@ -6270,12 +6281,8 @@ next: size_t extent_offset; size_t copy_size; - em->block_start = EXTENT_MAP_INLINE; - if (!page || create) { - em->start = extent_start; - em->len = extent_end - extent_start; + if (new_inline) goto out; - } size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_offset = page_offset(page) + pg_offset - extent_start; @@ -6285,10 +6292,6 @@ next: em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; - if (compress_type) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != @@ -6296,7 +6299,10 @@ next: ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + err = ret; + goto out; + } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6332,8 +6338,6 @@ next: set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; - } else { - WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type); } not_found: em->start = start; @@ -6550,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, - alloc_hint, &ins, 1); + alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, 0); if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); return em; } ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); free_extent_map(em); return ERR_PTR(ret); } @@ -6717,6 +6721,76 @@ out: return ret; } +bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) +{ + struct radix_tree_root *root = &inode->i_mapping->page_tree; + int found = false; + void **pagep = NULL; + struct page *page = NULL; + int start_idx; + int end_idx; + + start_idx = start >> PAGE_CACHE_SHIFT; + + /* + * end is the last byte in the last page. end == start is legal + */ + end_idx = end >> PAGE_CACHE_SHIFT; + + rcu_read_lock(); + + /* Most of the code in this while loop is lifted from + * find_get_page. It's been modified to begin searching from a + * page and return just the first page found in that range. If the + * found idx is less than or equal to the end idx then we know that + * a page exists. If no pages are found or if those pages are + * outside of the range then we're fine (yay!) */ + while (page == NULL && + radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + break; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + page = NULL; + continue; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + page = NULL; + break; /* TODO: Is this relevant for this use case? */ + } + + if (!page_cache_get_speculative(page)) { + page = NULL; + continue; + } + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + page = NULL; + } + } + + if (page) { + if (page->index <= end_idx) + found = true; + page_cache_release(page); + } + + rcu_read_unlock(); + return found; +} + static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, int writing) { @@ -6741,10 +6815,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * invalidate needs to happen so that reads after a write do not * get stale data. */ - if (!ordered && (!writing || - !test_range_bit(&BTRFS_I(inode)->io_tree, - lockstart, lockend, EXTENT_UPTODATE, 0, - *cached_state))) + if (!ordered && + (!writing || + !btrfs_page_exists_in_range(inode, lockstart, lockend))) break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, @@ -7126,7 +7199,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) * before atomic variable goto zero, we must make sure * dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); } /* if there are more bios still pending for this dio, just exit */ @@ -7306,7 +7379,7 @@ out_err: * before atomic variable goto zero, we must * make sure dip->errors is perceived to be set. */ - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); @@ -7383,7 +7456,7 @@ free_ordered: if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len); + ordered->disk_len, 1); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -7391,39 +7464,30 @@ free_ordered: } static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + const struct iov_iter *iter, loff_t offset) { int seg; int i; - size_t size; - unsigned long addr; unsigned blocksize_mask = root->sectorsize - 1; ssize_t retval = -EINVAL; - loff_t end = offset; if (offset & blocksize_mask) goto out; - /* Check the memory alignment. Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; - end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - - /* If this is a write we don't need to check anymore */ - if (rw & WRITE) - continue; + if (iov_iter_alignment(iter) & blocksize_mask) + goto out; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. - */ - for (i = seg + 1; i < nr_segs; i++) { - if (iov[seg].iov_base == iov[i].iov_base) + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + return 0; + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. + */ + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) goto out; } } @@ -7433,8 +7497,7 @@ out: } static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -7444,12 +7507,11 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, bool relock = false; ssize_t ret; - if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, - offset, nr_segs)) + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset)) return 0; atomic_inc(&inode->i_dio_count); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* * The generic stuff only does filemap_write_and_wait_range, which @@ -7457,7 +7519,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, * we need to flush the dirty pages again to make absolutely sure * that any outstanding dirty pages are on disk. */ - count = iov_length(iov, nr_segs); + count = iov_iter_count(iter); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) filemap_fdatawrite_range(inode->i_mapping, offset, count); @@ -7484,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = __blockdev_direct_IO(rw, iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { if (ret < 0 && ret != -EIOCBQUEUED) @@ -7992,7 +8054,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) btrfs_err(new_root->fs_info, - "error inheriting subvolume %llu properties: %d\n", + "error inheriting subvolume %llu properties: %d", new_root->root_key.objectid, err); err = btrfs_update_inode(trans, new_root, inode); @@ -8311,7 +8373,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, @@ -8765,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1); + *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -8779,7 +8841,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, BTRFS_FILE_EXTENT_PREALLOC); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, - ins.offset); + ins.offset, 0); btrfs_abort_transaction(trans, root, ret); if (own_trans) btrfs_end_transaction(trans, root); @@ -8889,6 +8951,66 @@ static int btrfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + u64 objectid; + u64 index; + int ret = 0; + + /* + * 5 units required for adding orphan entry + */ + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + goto out; + + inode = btrfs_new_inode(trans, root, dir, NULL, 0, + btrfs_ino(dir), objectid, mode, &index); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + goto out; + } + + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out; + + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + + d_tmpfile(dentry, inode); + mark_inode_dirty(inode); + +out: + btrfs_end_transaction(trans, root); + if (ret) + iput(inode); + btrfs_balance_delayed_items(root); + btrfs_btree_balance_dirty(root); + + return ret; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -8909,6 +9031,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, + .tmpfile = btrfs_tmpfile, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e79ff6b90cb7..47aceb494d1d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -58,6 +58,7 @@ #include "dev-replace.h" #include "props.h" #include "sysfs.h" +#include "qgroup.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -135,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) void btrfs_update_iflags(struct inode *inode) { struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + unsigned int new_fl = 0; if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + + set_mask_bits(&inode->i_flags, + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, + new_fl); } /* @@ -638,11 +642,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_trans_handle *trans; int ret; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; atomic_inc(&root->will_be_snapshoted); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); btrfs_wait_nocow_write(root); ret = btrfs_start_delalloc_inodes(root, 0); @@ -711,6 +715,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + /* + * If orphan cleanup did remove any orphans, it means the tree was + * modified and therefore the commit root is not the same as the + * current root anymore. This is a problem, because send uses the + * commit root and therefore can see inode items that don't exist + * in the current root anymore, and for example make calls to + * btrfs_iget, which will do tree lookups based on the current root + * and not on the commit root. Those lookups will fail, returning a + * -ESTALE error, and making send fail with that error. So make sure + * a send does not see any orphans we have just removed, and that it + * will see the same inodes regardless of whether a transaction + * commit happened before it started (meaning that the commit root + * will be the same as the current root) or not. + */ + if (readonly && pending_snapshot->snap->node != + pending_snapshot->snap->commit_root) { + trans = btrfs_join_transaction(pending_snapshot->snap); + if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot->snap); + if (ret) + goto fail; + } + } + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -1502,11 +1535,12 @@ static noinline int btrfs_ioctl_resize(struct file *file, sizestr = vol_args->name; devstr = strchr(sizestr, ':'); if (devstr) { - char *end; sizestr = devstr + 1; *devstr = '\0'; devstr = vol_args->name; - devid = simple_strtoull(devstr, &end, 10); + ret = kstrtoull(devstr, 10, &devid); + if (ret) + goto out_free; if (!devid) { ret = -EINVAL; goto out_free; @@ -1562,7 +1596,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { - ret = -EINVAL; + ret = -ERANGE; goto out_free; } new_size = old_size + new_size; @@ -1926,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - char *buf, + size_t *buf_size, + char __user *ubuf, unsigned long *sk_offset, int *num_found) { @@ -1958,13 +1993,25 @@ static noinline int copy_to_sk(struct btrfs_root *root, if (!key_in_sk(key, sk)) continue; - if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE) + if (sizeof(sh) + item_len > *buf_size) { + if (*num_found) { + ret = 1; + goto out; + } + + /* + * return one empty item back for v1, which does not + * handle -EOVERFLOW + */ + + *buf_size = sizeof(sh) + item_len; item_len = 0; + ret = -EOVERFLOW; + } - if (sizeof(sh) + item_len + *sk_offset > - BTRFS_SEARCH_ARGS_BUFSIZE) { + if (sizeof(sh) + item_len + *sk_offset > *buf_size) { ret = 1; - goto overflow; + goto out; } sh.objectid = key->objectid; @@ -1974,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root, sh.transid = found_transid; /* copy search result header */ - memcpy(buf + *sk_offset, &sh, sizeof(sh)); + if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = -EFAULT; + goto out; + } + *sk_offset += sizeof(sh); if (item_len) { - char *p = buf + *sk_offset; + char __user *up = ubuf + *sk_offset; /* copy the item */ - read_extent_buffer(leaf, p, - item_off, item_len); + if (read_extent_buffer_to_user(leaf, up, + item_off, item_len)) { + ret = -EFAULT; + goto out; + } + *sk_offset += item_len; } (*num_found)++; - if (*num_found >= sk->nr_items) - break; + if (ret) /* -EOVERFLOW from above */ + goto out; + + if (*num_found >= sk->nr_items) { + ret = 1; + goto out; + } } advance_key: ret = 0; @@ -2002,22 +2062,37 @@ advance_key: key->objectid++; } else ret = 1; -overflow: +out: + /* + * 0: all items from this leaf copied, continue with next + * 1: * more items can be copied, but unused buffer is too small + * * all items were found + * Either way, it will stops the loop which iterates to the next + * leaf + * -EOVERFLOW: item was to large for buffer + * -EFAULT: could not copy extent buffer back to userspace + */ return ret; } static noinline int search_ioctl(struct inode *inode, - struct btrfs_ioctl_search_args *args) + struct btrfs_ioctl_search_key *sk, + size_t *buf_size, + char __user *ubuf) { struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; - struct btrfs_ioctl_search_key *sk = &args->key; struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; int ret; int num_found = 0; unsigned long sk_offset = 0; + if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { + *buf_size = sizeof(struct btrfs_ioctl_search_header); + return -EOVERFLOW; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2051,14 +2126,15 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, args->buf, + ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); - if (ret || num_found >= sk->nr_items) + if (ret) break; } - ret = 0; + if (ret > 0) + ret = 0; err: sk->nr_items = num_found; btrfs_free_path(path); @@ -2068,22 +2144,73 @@ err: static noinline int btrfs_ioctl_tree_search(struct file *file, void __user *argp) { - struct btrfs_ioctl_search_args *args; - struct inode *inode; - int ret; + struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_key sk; + struct inode *inode; + int ret; + size_t buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = memdup_user(argp, sizeof(*args)); - if (IS_ERR(args)) - return PTR_ERR(args); + uargs = (struct btrfs_ioctl_search_args __user *)argp; + + if (copy_from_user(&sk, &uargs->key, sizeof(sk))) + return -EFAULT; + + buf_size = sizeof(uargs->buf); inode = file_inode(file); - ret = search_ioctl(inode, args); - if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) + ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + + /* + * In the origin implementation an overflow is handled by returning a + * search header with a len of zero, so reset ret. + */ + if (ret == -EOVERFLOW) + ret = 0; + + if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) ret = -EFAULT; - kfree(args); + return ret; +} + +static noinline int btrfs_ioctl_tree_search_v2(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 args; + struct inode *inode; + int ret; + size_t buf_size; + const size_t buf_limit = 16 * 1024 * 1024; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* copy search header and buffer size */ + uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; + if (copy_from_user(&args, uarg, sizeof(args))) + return -EFAULT; + + buf_size = args.buf_size; + + if (buf_size < sizeof(struct btrfs_ioctl_search_header)) + return -EOVERFLOW; + + /* limit result size to 16MB */ + if (buf_size > buf_limit) + buf_size = buf_limit; + + inode = file_inode(file); + ret = search_ioctl(inode, &args.key, &buf_size, + (char *)(&uarg->buf[0])); + if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) + ret = -EFAULT; + else if (ret == -EOVERFLOW && + copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) + ret = -EFAULT; + return ret; } @@ -2219,6 +2346,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; + u64 root_flags; u64 qgroup_reserved; int namelen; int ret; @@ -2240,6 +2368,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) goto out_drop_write; @@ -2301,6 +2430,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } mutex_lock(&inode->i_mutex); + + /* + * Don't allow to delete a subvolume with send in progress. This is + * inside the i_mutex so the error handling that has to drop the bit + * again is not run concurrently. + */ + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + if (dest->send_in_progress == 0) { + btrfs_set_root_flags(&dest->root_item, + root_flags | BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } else { + spin_unlock(&dest->root_item_lock); + btrfs_warn(root->fs_info, + "Attempt to delete subvolume %llu during send", + dest->root_key.objectid); + err = -EPERM; + goto out_dput; + } + err = d_invalidate(dentry); if (err) goto out_unlock; @@ -2346,7 +2496,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - if (!xchg(&dest->orphan_item_inserted, 1)) { + if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, dest->root_key.objectid); @@ -2389,11 +2539,19 @@ out_release: out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: + if (err) { + spin_lock(&dest->root_item_lock); + root_flags = btrfs_root_flags(&dest->root_item); + btrfs_set_root_flags(&dest->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); + spin_unlock(&dest->root_item_lock); + } mutex_unlock(&inode->i_mutex); if (!err) { shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + ASSERT(dest->send_in_progress == 0); /* the last ref */ if (dest->cache_inode) { @@ -2557,9 +2715,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); if (!fi_args) return -ENOMEM; @@ -2574,6 +2729,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) } mutex_unlock(&fs_devices->device_list_mutex); + fi_args->nodesize = root->fs_info->super_copy->nodesize; + fi_args->sectorsize = root->fs_info->super_copy->sectorsize; + fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; @@ -2589,9 +2748,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) int ret = 0; char *s_uuid = NULL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); @@ -2669,10 +2825,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); ordered = btrfs_lookup_first_ordered_extent(inode, off + len - 1); - if (!ordered && + if ((!ordered || + ordered->file_offset + ordered->len <= off || + ordered->file_offset >= off + len) && !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) + off + len - 1, EXTENT_DELALLOC, 0, NULL)) { + if (ordered) + btrfs_put_ordered_extent(ordered); break; + } unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); if (ordered) btrfs_put_ordered_extent(ordered); @@ -2912,6 +3073,129 @@ out: return ret; } +/* Helper to check and see if this root currently has a ref on the given disk + * bytenr. If it does then we need to update the quota for this root. This + * doesn't do anything if quotas aren't enabled. + */ +static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 disko) +{ + struct seq_list tree_mod_seq_elem = {}; + struct ulist *roots; + struct ulist_iterator uiter; + struct ulist_node *root_node = NULL; + int ret; + + if (!root->fs_info->quota_enabled) + return 1; + + btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + ret = btrfs_find_all_roots(trans, root->fs_info, disko, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + ret = 0; + ULIST_ITER_INIT(&uiter); + while ((root_node = ulist_next(roots, &uiter))) { + if (root_node->val == root->objectid) { + ret = 1; + break; + } + } + ulist_free(roots); +out: + btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); + return ret; +} + +static int clone_finish_inode_update(struct btrfs_trans_handle *trans, + struct inode *inode, + u64 endoff, + const u64 destoff, + const u64 olen) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + /* + * We round up to the block size at eof when determining which + * extents to clone above, but shouldn't round up the file size. + */ + if (endoff > destoff + olen) + endoff = destoff + olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + ret = btrfs_end_transaction(trans, root); +out: + return ret; +} + +static void clone_update_extent_map(struct inode *inode, + const struct btrfs_trans_handle *trans, + const struct btrfs_path *path, + const u64 hole_offset, + const u64 hole_len) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + return; + } + + if (path) { + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_extent_item_to_extent_map(inode, path, fi, false, em); + em->generation = -1; + if (btrfs_file_extent_type(path->nodes[0], fi) == + BTRFS_FILE_EXTENT_INLINE) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + } else { + em->start = hole_offset; + em->len = hole_len; + em->ram_bytes = em->len; + em->orig_start = hole_offset; + em->block_start = EXTENT_MAP_HOLE; + em->block_len = 0; + em->orig_block_len = 0; + em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = trans->transid; + } + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 1); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + + if (unlikely(ret)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -2924,7 +3208,8 @@ out: * @destoff: Offset within @inode to start clone */ static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff) + const u64 off, const u64 olen, const u64 olen_aligned, + const u64 destoff) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -2935,7 +3220,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - u64 len = olen_aligned; + int no_quota; + const u64 len = olen_aligned; + u64 last_disko = 0; + u64 last_dest_end = destoff; ret = -ENOMEM; buf = vmalloc(btrfs_level_size(root, 0)); @@ -2952,7 +3240,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; + key.offset = off; while (1) { /* @@ -2964,9 +3252,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode, 0, 0); if (ret < 0) goto out; + /* + * First search, if no extent item that starts at offset off was + * found but the previous item is an extent item, it's possible + * it might overlap our target range, therefore process it. + */ + if (key.offset == off && ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0] - 1); + if (key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } nritems = btrfs_header_nritems(path->nodes[0]); process_slot: + no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -2991,7 +3291,7 @@ process_slot: u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; - u64 endoff; + u64 drop_start; extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -3012,10 +3312,16 @@ process_slot: extent); } - if (key.offset + datal <= off || - key.offset >= off + len - 1) { + /* + * The first search might have left us at an extent + * item that ends before our target range's start, can + * happen if we have holes and NO_HOLES feature enabled. + */ + if (key.offset + datal <= off) { path->slots[0]++; goto process_slot; + } else if (key.offset >= off + len) { + break; } size = btrfs_item_size_nr(leaf, slot); @@ -3034,6 +3340,18 @@ process_slot: new_key.offset = destoff; /* + * Deal with a hole that doesn't have an extent item + * that represents it (NO_HOLES feature enabled). + * This hole is either in the middle of the cloning + * range or at the beginning (fully overlaps it or + * partially overlaps it). + */ + if (new_key.offset != last_dest_end) + drop_start = last_dest_end; + else + drop_start = new_key.offset; + + /* * 1 - adjusting old extent (we may have to split it) * 1 - add new extent * 1 - inode update @@ -3051,22 +3369,22 @@ process_slot: * | ------------- extent ------------- | */ - /* substract range b */ + /* subtract range b */ if (key.offset + datal > off + len) datal = off + len - key.offset; - /* substract range a */ + /* subtract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, + drop_start, new_key.offset + datal, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); @@ -3099,6 +3417,28 @@ process_slot: datao); btrfs_set_file_extent_num_bytes(leaf, extent, datal); + + /* + * We need to look up the roots that point at + * this bytenr and see if the new root does. If + * it does not we need to make sure we update + * quotas appropriately. + */ + if (disko && root != BTRFS_I(src)->root && + disko != last_disko) { + no_quota = check_ref(trans, root, + disko); + if (no_quota < 0) { + btrfs_abort_transaction(trans, + root, + ret); + btrfs_end_transaction(trans, + root); + ret = no_quota; + goto out; + } + } + if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, @@ -3106,7 +3446,7 @@ process_slot: root->root_key.objectid, btrfs_ino(inode), new_key.offset - datao, - 0); + no_quota); if (ret) { btrfs_abort_transaction(trans, root, @@ -3120,6 +3460,8 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3136,12 +3478,14 @@ process_slot: size -= skip + trim; datal -= skip + trim; + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, - new_key.offset, - new_key.offset + datal, + drop_start, + aligned_end, 1); if (ret) { - if (ret != -EINVAL) + if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); @@ -3172,38 +3516,62 @@ process_slot: inode_add_bytes(inode, datal); } + /* If we have an implicit hole (NO_HOLES feature). */ + if (drop_start < new_key.offset) + clone_update_extent_map(inode, trans, + NULL, drop_start, + new_key.offset - drop_start); + + clone_update_extent_map(inode, trans, path, 0, 0); + btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - /* - * we round up to the block size at eof when - * determining which extents to clone above, - * but shouldn't round up the file size - */ - endoff = new_key.offset + datal; - if (endoff > destoff+olen) - endoff = destoff+olen; - if (endoff > inode->i_size) - btrfs_i_size_write(inode, endoff); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); + last_dest_end = new_key.offset + datal; + ret = clone_finish_inode_update(trans, inode, + last_dest_end, + destoff, olen); + if (ret) goto out; - } - ret = btrfs_end_transaction(trans, root); + if (new_key.offset + datal >= destoff + len) + break; } btrfs_release_path(path); key.offset++; } ret = 0; + if (last_dest_end < destoff + len) { + /* + * We have an implicit hole (NO_HOLES feature is enabled) that + * fully or partially overlaps our cloning range at its end. + */ + btrfs_release_path(path); + + /* + * 1 - remove extent(s) + * 1 - inode update + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_drop_extents(trans, root, inode, + last_dest_end, destoff + len, 1); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, root, ret); + btrfs_end_transaction(trans, root); + goto out; + } + clone_update_extent_map(inode, trans, NULL, last_dest_end, + destoff + len - last_dest_end); + ret = clone_finish_inode_update(trans, inode, destoff + len, + destoff, olen); + } + out: - btrfs_release_path(path); btrfs_free_path(path); vfree(buf); return ret; @@ -3315,15 +3683,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_unlock; } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); + /* + * Lock the target range too. Right after we replace the file extent + * items in the fs tree (which now point to the cloned data), we might + * have a worker replace them with extent items relative to a write + * operation that was issued before this clone operation (i.e. confront + * with inode.c:btrfs_finish_ordered_io). + */ + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - lock_extent_range(src, off, len); + lock_extent_range(src, lock_start, lock_len); + } else { + lock_extent_range(src, off, len); + lock_extent_range(inode, destoff, len); + } ret = btrfs_clone(src, inode, off, olen, len, destoff); - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + if (same_inode) { + u64 lock_start = min_t(u64, off, destoff); + u64 lock_end = max_t(u64, off, destoff) + len - 1; + + unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); + } else { + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, + destoff + len - 1); + } + /* + * Truncate page cache pages so that future reads will see the cloned + * data immediately and not the previous data. + */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: if (!same_inode) { if (inode < src) { @@ -4898,6 +5292,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_trans_end(file); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(file, argp); + case BTRFS_IOC_TREE_SEARCH_V2: + return btrfs_ioctl_tree_search_v2(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); case BTRFS_IOC_INO_PATHS: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 01277b8f2373..5665d2149249 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); */ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; if (rw == BTRFS_WRITE_LOCK) { if (atomic_read(&eb->blocking_writers) == 0) { WARN_ON(atomic_read(&eb->spinning_writers) != 1); @@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) */ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } + /* + * no lock is required. The lock owner may change if + * we have a read lock, but it won't change to or away + * from us. If we have the write lock, we are the owner + * and it'll never change. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) + return; + if (rw == BTRFS_WRITE_LOCK_BLOCKING) { BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); @@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) void btrfs_tree_read_lock(struct extent_buffer *eb) { again: + BUG_ON(!atomic_read(&eb->blocking_writers) && + current->pid == eb->lock_owner); + read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers) && current->pid == eb->lock_owner) { @@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) if (atomic_read(&eb->blocking_writers)) return 0; - read_lock(&eb->lock); + if (!read_trylock(&eb->lock)) + return 0; + if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); return 0; @@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) return 0; - write_lock(&eb->lock); + + if (!write_trylock(&eb->lock)) + return 0; + if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); @@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->spinning_readers) == 0); @@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); + /* + * if we're nested, we have the write lock. No new locking + * is needed as long as we are the lock owner. + * The write unlock will do a barrier for us, and the lock_nested + * field only matters to the lock owner. + */ + if (eb->lock_nested && current->pid == eb->lock_owner) { + eb->lock_nested = 0; + return; } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); @@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) BUG_ON(blockers > 1); btrfs_assert_tree_locked(eb); + eb->lock_owner = 0; atomic_dec(&eb->write_locks); if (blockers) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b47f669aca75..dfad8514f0da 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws, if (ret != LZO_E_OK) { printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); - ret = -1; + ret = -EIO; goto out; } @@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } @@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws, /* we're making it bigger, give up */ if (tot_in > 8192 && tot_in < tot_out) { - ret = -1; + ret = -E2BIG; goto out; } @@ -335,7 +335,7 @@ cont: break; if (page_in_index + 1 >= total_pages_in) { - ret = -1; + ret = -EIO; goto done; } @@ -358,7 +358,7 @@ cont: kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed\n"); - ret = -1; + ret = -EIO; break; } @@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (ret != LZO_E_OK) { printk(KERN_WARNING "BTRFS: decompress failed!\n"); - ret = -1; + ret = -EIO; goto out; } if (out_len < start_byte) { - ret = -1; + ret = -EIO; goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a94b05f72869..7187b14faa6c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " - "%llu\n", offset); + "%llu", offset); } /* @@ -484,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) log_list); list_del_init(&ordered->log_list); spin_unlock_irq(&log->log_extents_lock[index]); + + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + struct inode *inode = ordered->inode; + u64 start = ordered->file_offset; + u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(!inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6efd70d3b64f..9626b4ad3b9a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot) +static void print_extent_item(struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot) struct btrfs_disk_key key; unsigned long end; unsigned long ptr; - int type; u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; @@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot) btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), flags); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if ((type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); @@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); + case BTRFS_METADATA_ITEM_KEY: + print_extent_item(l, i, type); break; case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref\n"); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2cf905877aaf..98cb6b2630f9 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -32,6 +32,7 @@ #include "ulist.h" #include "backref.h" #include "extent_io.h" +#include "qgroup.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -84,8 +85,8 @@ struct btrfs_qgroup { /* * temp variables for accounting operations */ - u64 tag; - u64 refcnt; + u64 old_refcnt; + u64 new_refcnt; }; /* @@ -98,6 +99,9 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; +#define ptr_to_u64(x) ((u64)(uintptr_t)x) +#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) + static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info, return -ENOENT; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl) +{ + struct btrfs_qgroup *qgroup; + + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) + return -EINVAL; + if (qgroup->rfer != rfer || qgroup->excl != excl) + return -EINVAL; + return 0; +} +#endif + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + return 0; +#endif path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 0; +#endif key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; @@ -1174,33 +1201,198 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } +static int comp_oper(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return -1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + if (oper1->type < oper2->type) + return -1; + if (oper1->type > oper2->type) + return 1; + return 0; +} + +static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + p = &fs_info->qgroup_op_tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(parent, struct btrfs_qgroup_operation, n); + cmp = comp_oper(cur, oper); + if (cmp < 0) { + p = &(*p)->rb_right; + } else if (cmp) { + p = &(*p)->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + rb_link_node(&oper->n, parent, p); + rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} /* - * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts - * the modification into a list that's later used by btrfs_end_transaction to - * pass the recorded modifications on to btrfs_qgroup_account_ref. + * Record a quota operation for processing later on. + * @trans: the transaction we are adding the delayed op to. + * @fs_info: the fs_info for this fs. + * @ref_root: the root of the reference we are acting on, + * @bytenr: the bytenr we are acting on. + * @num_bytes: the number of bytes in the reference. + * @type: the type of operation this is. + * @mod_seq: do we need to get a sequence number for looking up roots. + * + * We just add it to our trans qgroup_ref_list and carry on and process these + * operations in order at some later point. If the reference root isn't a fs + * root then we don't bother with doing anything. + * + * MUST BE HOLDING THE REF LOCK. */ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, int mod_seq) { - struct qgroup_update *u; + struct btrfs_qgroup_operation *oper; + int ret; + + if (!is_fstree(ref_root) || !fs_info->quota_enabled) + return 0; - BUG_ON(!trans->delayed_ref_elem.seq); - u = kmalloc(sizeof(*u), GFP_NOFS); - if (!u) + oper = kmalloc(sizeof(*oper), GFP_NOFS); + if (!oper) return -ENOMEM; - u->node = node; - u->extent_op = extent_op; - list_add_tail(&u->list, &trans->qgroup_ref_list); + oper->ref_root = ref_root; + oper->bytenr = bytenr; + oper->num_bytes = num_bytes; + oper->type = type; + oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); + INIT_LIST_HEAD(&oper->elem.list); + oper->elem.seq = 0; + ret = insert_qgroup_oper(fs_info, oper); + if (ret) { + /* Shouldn't happen so have an assert for developers */ + ASSERT(0); + kfree(oper); + return ret; + } + list_add_tail(&oper->list, &trans->qgroup_ref_list); + + if (mod_seq) + btrfs_get_tree_mod_seq(fs_info, &oper->elem); return 0; } -static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq) +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + */ +static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct btrfs_qgroup *qgroup; + struct ulist *tmp; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int sign = 0; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) + goto out; + qgroup = find_qgroup_rb(fs_info, oper->ref_root); + if (!qgroup) + goto out; + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + sign = 1; + break; + case BTRFS_QGROUP_OPER_SUB_EXCL: + sign = -1; + break; + default: + ASSERT(0); + } + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); + qgroup->excl += sign * oper->num_bytes; + qgroup->excl_cmpr += sign * oper->num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * oper->num_bytes; + qgroup->rfer_cmpr += sign * oper->num_bytes; + qgroup->excl += sign * oper->num_bytes; + if (sign < 0) + WARN_ON(qgroup->excl < oper->num_bytes); + qgroup->excl_cmpr += sign * oper->num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + spin_unlock(&fs_info->qgroup_lock); + ulist_free(tmp); + return ret; +} + +/* + * Walk all of the roots that pointed to our bytenr and adjust their refcnts as + * properly. + */ +static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, + u64 root_to_skip, struct ulist *tmp, + struct ulist *roots, struct ulist *qgroups, + u64 seq, int *old_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1211,256 +1403,551 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + /* We don't count our current root here */ + if (unode->val == root_to_skip) + continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; + /* + * We could have a pending removal of this same ref so we may + * not have actually found our ref root when doing + * btrfs_find_all_roots, so we need to keep track of how many + * old roots we find in case we removed ours and added a + * different one at the same time. I don't think this could + * happen in practice but that sort of thinking leads to pain + * and suffering and to the dark side. + */ + (*old_roots)++; ulist_reinit(tmp); - /* XXX id not needed */ - ret = ulist_add(tmp, qg->qgroupid, - (u64)(uintptr_t)qg, GFP_ATOMIC); + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); if (ret < 0) return ret; ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->refcnt < seq) - qg->refcnt = seq + 1; + qg = u64_to_ptr(tmp_unode->aux); + /* + * We use this sequence number to keep from having to + * run the whole list and 0 out the refcnt every time. + * We basically use sequnce as the known 0 count and + * then add 1 everytime we see a qgroup. This is how we + * get how many of the roots actually point up to the + * upper level qgroups in order to determine exclusive + * counts. + * + * For rescan we want to set old_refcnt to seq so our + * exclusive calculations end up correct. + */ + if (rescan) + qg->old_refcnt = seq; + else if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; else - ++qg->refcnt; + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), + GFP_ATOMIC); + if (ret < 0) + return ret; ret = ulist_add(tmp, glist->group->qgroupid, - (u64)(uintptr_t)glist->group, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } } + return 0; +} + +/* + * We need to walk forward in our operation tree and account for any roots that + * were deleted after we made this operation. + */ +static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct ulist *tmp, + struct ulist *qgroups, u64 seq, + int *old_roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_operation *tmp_oper; + struct rb_node *n; + int ret; + + ulist_reinit(tmp); + /* + * We only walk forward in the tree since we're only interested in + * removals that happened _after_ our operation. + */ + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + return 0; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + while (tmp_oper->bytenr == oper->bytenr) { + /* + * If it's not a removal we don't care, additions work out + * properly with our refcnt tracking. + */ + if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && + tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) + goto next; + qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); + if (!qg) + goto next; + ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret) { + if (ret < 0) + return ret; + /* + * We only want to increase old_roots if this qgroup is + * not already in the list of qgroups. If it is already + * there then that means it must have been re-added or + * the delete will be discarded because we had an + * existing ref that we haven't looked up yet. In this + * case we don't want to increase old_roots. So if ret + * == 1 then we know that this is the first time we've + * seen this qgroup and we can bump the old_roots. + */ + (*old_roots)++; + ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), + GFP_ATOMIC); + if (ret < 0) + return ret; + } +next: + spin_lock(&fs_info->qgroup_op_lock); + n = rb_next(&tmp_oper->n); + spin_unlock(&fs_info->qgroup_op_lock); + if (!n) + break; + tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); + } + + /* Ok now process the qgroups we found */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup_list *glist; + + qg = u64_to_ptr(unode->aux); + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + } + } return 0; } -static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes, - struct btrfs_qgroup *qgroup) +/* Add refcnt for the newly added reference. */ +static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper, + struct btrfs_qgroup *qgroup, + struct ulist *tmp, struct ulist *qgroups, + u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; int ret; ulist_reinit(tmp); - ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), + GFP_ATOMIC); if (ret < 0) return ret; - ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; - if (qg->refcnt < seq) { - /* not visited by step 1 */ - qg->rfer += sgn * num_bytes; - qg->rfer_cmpr += sgn * num_bytes; - if (roots->nnodes == 0) { - qg->excl += sgn * num_bytes; - qg->excl_cmpr += sgn * num_bytes; - } - qgroup_dirty(fs_info, qg); - } - WARN_ON(qg->tag >= seq); - qg->tag = seq; + struct btrfs_qgroup_list *glist; + qg = u64_to_ptr(unode->aux); + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + if (qg->new_refcnt < seq) + qg->new_refcnt = seq + 1; + else + qg->new_refcnt++; + } else { + if (qg->old_refcnt < seq) + qg->old_refcnt = seq + 1; + else + qg->old_refcnt++; + } list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + return ret; + ret = ulist_add(qgroups, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); if (ret < 0) return ret; } } - return 0; } -static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - u64 seq, int sgn, u64 num_bytes) +/* + * This adjusts the counters for all referenced qgroups if need be. + */ +static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, + u64 root_to_skip, u64 num_bytes, + struct ulist *qgroups, u64 seq, + int old_roots, int new_roots, int rescan) { struct ulist_node *unode; struct ulist_iterator uiter; struct btrfs_qgroup *qg; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - int ret; + u64 cur_new_count, cur_old_count; ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; + while ((unode = ulist_next(qgroups, &uiter))) { + bool dirty = false; - ulist_reinit(tmp); - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); - if (ret < 0) - return ret; + qg = u64_to_ptr(unode->aux); + /* + * Wasn't referenced before but is now, add to the reference + * counters. + */ + if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + qg->rfer += num_bytes; + qg->rfer_cmpr += num_bytes; + dirty = true; + } - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; + /* + * Was referenced before but isn't now, subtract from the + * reference counters. + */ + if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + qg->rfer -= num_bytes; + qg->rfer_cmpr -= num_bytes; + dirty = true; + } - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->tag == seq) - continue; + if (qg->old_refcnt < seq) + cur_old_count = 0; + else + cur_old_count = qg->old_refcnt - seq; + if (qg->new_refcnt < seq) + cur_new_count = 0; + else + cur_new_count = qg->new_refcnt - seq; - if (qg->refcnt - seq == roots->nnodes) { - qg->excl -= sgn * num_bytes; - qg->excl_cmpr -= sgn * num_bytes; - qgroup_dirty(fs_info, qg); - } + /* + * If our refcount was the same as the roots previously but our + * new count isn't the same as the number of roots now then we + * went from having a exclusive reference on this range to not. + */ + if (old_roots && cur_old_count == old_roots && + (cur_new_count != new_roots || new_roots == 0)) { + WARN_ON(cur_new_count != new_roots && new_roots == 0); + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) - return ret; - } + /* + * If we didn't reference all the roots before but now we do we + * have an exclusive reference to this range. + */ + if ((!old_roots || (old_roots && cur_old_count != old_roots)) + && cur_new_count == new_roots) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; } - } + if (dirty) + qgroup_dirty(fs_info, qg); + } return 0; } /* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. + * If we removed a data extent and there were other references for that bytenr + * then we need to lookup all referenced roots to make sure we still don't + * reference this bytenr. If we do then we can just discard this operation. */ -int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) +static int check_existing_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) { - struct btrfs_root *quota_root; - u64 ref_root; - struct btrfs_qgroup *qgroup; struct ulist *roots = NULL; - u64 seq; + struct ulist_node *unode; + struct ulist_iterator uiter; int ret = 0; - int sgn; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + oper->elem.seq, &roots); + if (ret < 0) + return ret; + ret = 0; - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - struct btrfs_delayed_tree_ref *ref; - ref = btrfs_delayed_node_to_tree_ref(node); - ref_root = ref->root; - } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - ref_root = ref->root; - } else { - BUG(); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + if (unode->val == oper->ref_root) { + ret = 1; + break; + } } + ulist_free(roots); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); - if (!is_fstree(ref_root)) { - /* - * non-fs-trees are not being accounted - */ - return 0; - } + return ret; +} - switch (node->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - sgn = 1; - seq = btrfs_tree_mod_seq_prev(node->seq); - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - seq = node->seq; - break; - case BTRFS_UPDATE_DELAYED_HEAD: - return 0; - default: - BUG(); - } +/* + * If we share a reference across multiple roots then we may need to adjust + * various qgroups referenced and exclusive counters. The basic premise is this + * + * 1) We have seq to represent a 0 count. Instead of looping through all of the + * qgroups and resetting their refcount to 0 we just constantly bump this + * sequence number to act as the base reference count. This means that if + * anybody is equal to or below this sequence they were never referenced. We + * jack this sequence up by the number of roots we found each time in order to + * make sure we don't have any overlap. + * + * 2) We first search all the roots that reference the area _except_ the root + * we're acting on currently. This makes up the old_refcnt of all the qgroups + * before. + * + * 3) We walk all of the qgroups referenced by the root we are currently acting + * on, and will either adjust old_refcnt in the case of a removal or the + * new_refcnt in the case of an addition. + * + * 4) Finally we walk all the qgroups that are referenced by this range + * including the root we are acting on currently. We will adjust the counters + * based on the number of roots we had and will have after this operation. + * + * Take this example as an illustration + * + * [qgroup 1/0] + * / | \ + * [qg 0/0] [qg 0/1] [qg 0/2] + * \ | / + * [ extent ] + * + * Say we are adding a reference that is covered by qg 0/0. The first step + * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with + * old_roots being 2. Because it is adding new_roots will be 1. We then go + * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's + * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we + * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a + * reference and thus must add the size to the referenced bytes. Everything + * else is the same so nothing else changes. + */ +static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist *qgroups, *tmp; + struct btrfs_qgroup *qgroup; + struct seq_list elem = {}; + u64 seq; + int old_roots = 0; + int new_roots = 0; + int ret = 0; - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); + if (oper->elem.seq) { + ret = check_existing_refs(trans, fs_info, oper); + if (ret < 0) + return ret; + if (ret) return 0; - } } - mutex_unlock(&fs_info->qgroup_rescan_lock); - /* - * the delayed ref sequence number we pass depends on the direction of - * the operation. for add operations, we pass - * tree_mod_log_prev_seq(node->seq) to skip - * the delayed ref's current sequence number, because we need the state - * of the tree before the add operation. for delete operations, we pass - * (node->seq) to include the delayed ref's current sequence number, - * because we need the state of the tree after the delete operation. - */ - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); - if (ret < 0) - return ret; - - spin_lock(&fs_info->qgroup_lock); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) - goto unlock; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) { + ulist_free(qgroups); + return -ENOMEM; + } - qgroup = find_qgroup_rb(fs_info, ref_root); + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, + &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) { + ulist_free(qgroups); + ulist_free(tmp); + return ret; + } + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, oper->ref_root); if (!qgroup) - goto unlock; + goto out; + seq = fs_info->qgroup_seq; /* - * step 1: for each old ref, visit all nodes once and inc refcnt + * So roots is the list of all the roots currently pointing at the + * bytenr, including the ref we are adding if we are adding, or not if + * we are removing a ref. So we pass in the ref_root to skip that root + * in our calculations. We set old_refnct and new_refcnt cause who the + * hell knows what everything looked like before, and it doesn't matter + * except... */ - ulist_reinit(fs_info->qgroup_ulist); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, + seq, &old_roots, 0); + if (ret < 0) + goto out; - ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist, - seq); - if (ret) - goto unlock; + /* + * Now adjust the refcounts of the qgroups that care about this + * reference, either the old_count in the case of removal or new_count + * in the case of an addition. + */ + ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, + seq); + if (ret < 0) + goto out; /* - * step 2: walk from the new root + * ...in the case of removals. If we had a removal before we got around + * to processing this operation then we need to find that guy and count + * his references as if they really existed so we don't end up screwing + * up the exclusive counts. Then whenever we go to process the delete + * everything will be grand and we can account for whatever exclusive + * changes need to be made there. We also have to pass in old_roots so + * we have an accurate count of the roots as it pertains to this + * operations view of the world. */ - ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes, qgroup); - if (ret) - goto unlock; + ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, + &old_roots); + if (ret < 0) + goto out; /* - * step 3: walk again from old refs + * We are adding our root, need to adjust up the number of roots, + * otherwise old_roots is the number of roots we want. */ - ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist, - seq, sgn, node->num_bytes); - if (ret) - goto unlock; + if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { + new_roots = old_roots + 1; + } else { + new_roots = old_roots; + old_roots++; + } + fs_info->qgroup_seq += old_roots + 1; -unlock: + + /* + * And now the magic happens, bless Arne for having a pretty elegant + * solution for this. + */ + qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, + qgroups, seq, old_roots, new_roots, 0); +out: spin_unlock(&fs_info->qgroup_lock); + ulist_free(qgroups); ulist_free(roots); + ulist_free(tmp); + return ret; +} + +/* + * btrfs_qgroup_account_ref is called for every ref that is added to or deleted + * from the fs. First, all roots referencing the extent are searched, and + * then the space is accounted accordingly to the different roots. The + * accounting algorithm works in 3 steps documented inline. + */ +static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + BUG_ON(!fs_info->quota_root); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + ASSERT(is_fstree(oper->ref_root)); + + switch (oper->type) { + case BTRFS_QGROUP_OPER_ADD_EXCL: + case BTRFS_QGROUP_OPER_SUB_EXCL: + ret = qgroup_excl_accounting(fs_info, oper); + break; + case BTRFS_QGROUP_OPER_ADD_SHARED: + case BTRFS_QGROUP_OPER_SUB_SHARED: + ret = qgroup_shared_accounting(trans, fs_info, oper); + break; + default: + ASSERT(0); + } + return ret; +} + +/* + * Needs to be called everytime we run delayed refs, even if there is an error + * in order to cleanup outstanding operations. + */ +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup_operation *oper; + int ret = 0; + while (!list_empty(&trans->qgroup_ref_list)) { + oper = list_first_entry(&trans->qgroup_ref_list, + struct btrfs_qgroup_operation, list); + list_del_init(&oper->list); + if (!ret || !trans->aborted) + ret = btrfs_qgroup_account(trans, fs_info, oper); + spin_lock(&fs_info->qgroup_op_lock); + rb_erase(&oper->n, &fs_info->qgroup_op_tree); + spin_unlock(&fs_info->qgroup_op_lock); + btrfs_put_tree_mod_seq(fs_info, &oper->elem); + kfree(oper); + } return ret; } @@ -1629,8 +2116,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; - dstgroup->rfer = srcgroup->rfer - level_size; - dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size; + + /* + * We call inherit after we clone the root in order to make sure + * our counts don't go crazy, so at this point the only + * difference between the two roots should be the root node. + */ + dstgroup->rfer = srcgroup->rfer; + dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; + dstgroup->excl = level_size; + dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; qgroup_dirty(fs_info, dstgroup); @@ -1734,7 +2229,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + (s64)qg->rfer + num_bytes > @@ -1766,7 +2261,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved += num_bytes; } @@ -1812,7 +2307,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + qg = u64_to_ptr(unode->aux); qg->reserved -= num_bytes; @@ -1848,15 +2343,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *tmp, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, struct ulist *qgroups, + struct ulist *tmp, struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; struct seq_list tree_mod_seq_elem = {}; + u64 num_bytes; u64 seq; + int new_roots; int slot; int ret; @@ -1897,8 +2392,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { - u64 num_bytes; - btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -1908,76 +2401,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ret = btrfs_find_all_roots(trans, fs_info, found.objectid, - tree_mod_seq_elem.seq, &roots); + ulist_reinit(qgroups); + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, + &roots); if (ret < 0) goto out; spin_lock(&fs_info->qgroup_lock); seq = fs_info->qgroup_seq; fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); - if (ret) { + new_roots = 0; + ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, + seq, &new_roots, 1); + if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); goto out; } - /* - * step2 of btrfs_qgroup_account_ref works from a single root, - * we're doing all at once here. - */ - ulist_reinit(tmp); - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct btrfs_qgroup *qg; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } - - /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; - qg->rfer += num_bytes; - qg->rfer_cmpr += num_bytes; - WARN_ON(qg->tag >= seq); - if (qg->refcnt - seq == roots->nnodes) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - } - qgroup_dirty(fs_info, qg); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - } + ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, + seq, 0, new_roots, 1); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; } - spin_unlock(&fs_info->qgroup_lock); ulist_free(roots); - ret = 0; } - out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -1990,13 +2441,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL; + struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; path = btrfs_alloc_path(); if (!path) goto out; + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) + goto out; tmp = ulist_alloc(GFP_NOFS); if (!tmp) goto out; @@ -2015,7 +2469,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - tmp, scratch_leaf); + qgroups, tmp, scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2025,6 +2479,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); + ulist_free(qgroups); ulist_free(tmp); btrfs_free_path(path); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h new file mode 100644 index 000000000000..5952ff1fbd7a --- /dev/null +++ b/fs/btrfs/qgroup.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_QGROUP__ +#define __BTRFS_QGROUP__ + +/* + * A description of the operations, all of these operations only happen when we + * are adding the 1st reference for that subvolume in the case of adding space + * or on the last reference delete in the case of subtraction. The only + * exception is the last one, which is added for confusion. + * + * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only + * one pointing at the bytes we are adding. This is called on the first + * allocation. + * + * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be + * shared between subvols. This is called on the creation of a ref that already + * has refs from a different subvolume, so basically reflink. + * + * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only + * one referencing the range. + * + * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with + * refs with other subvolumes. + */ +enum btrfs_qgroup_operation_type { + BTRFS_QGROUP_OPER_ADD_EXCL, + BTRFS_QGROUP_OPER_ADD_SHARED, + BTRFS_QGROUP_OPER_SUB_EXCL, + BTRFS_QGROUP_OPER_SUB_SHARED, +}; + +struct btrfs_qgroup_operation { + u64 ref_root; + u64 bytenr; + u64 num_bytes; + u64 seq; + enum btrfs_qgroup_operation_type type; + struct seq_list elem; + struct rb_node n; + struct list_head list; +}; + +int btrfs_quota_enable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_quota_disable(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); +void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst); +int btrfs_create_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + char *name); +int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid); +int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 qgroupid, + struct btrfs_qgroup_limit *limit); +int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); +void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); +struct btrfs_delayed_extent_op; +int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 ref_root, + u64 bytenr, u64 num_bytes, + enum btrfs_qgroup_operation_type type, + int mod_seq); +int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper); +int btrfs_run_qgroups(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, + struct btrfs_qgroup_inherit *inherit); +int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); +void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); + +void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + u64 rfer, u64 excl); +#endif + +#endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4055291a523e..4a88f073fdd7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1956,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { - if (rbio->faila == stripe || - rbio->failb == stripe) + if (rbio->faila == stripe || rbio->failb == stripe) { + atomic_inc(&rbio->bbio->error); continue; + } for (pagenr = 0; pagenr < nr_pages; pagenr++) { struct page *p; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 30947f923620..09230cf3a244 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -428,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, continue; } if (!dev->bdev) { - /* cannot read ahead on missing device */ - continue; + /* + * cannot read ahead on missing device, but for RAID5/6, + * REQ_GET_READ_MIRRORS return 1. So don't skip missing + * device for such case. + */ + if (nzones > 1) + continue; } if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f92ab1daa87..65245a07275b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) if (bnode->root) fs_info = bnode->root->fs_info; btrfs_panic(fs_info, errno, "Inconsistency in backref cache " - "found at offset %llu\n", bytenr); + "found at offset %llu", bytenr); } /* @@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root) { struct btrfs_root *reloc_root; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; reloc_root = root->reloc_root; @@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc, root = read_fs_root(rc->extent_root->fs_info, root_objectid); BUG_ON(IS_ERR(root)); - if (root->ref_cows && + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && generation != btrfs_root_generation(&root->root_item)) return NULL; @@ -887,7 +887,7 @@ again: goto out; } - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) cur->cowonly = 1; if (btrfs_root_level(&root->root_item) == cur->level) { @@ -954,7 +954,8 @@ again: upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) upper->cowonly = 1; /* @@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) if (rb_node) { btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " "for start=%llu while inserting into relocation " - "tree\n", node->bytenr); + "tree", node->bytenr); kfree(node); return -EEXIST; } @@ -2441,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, next = walk_up_backref(next, edges, &index); root = next->root; BUG_ON(!root); - BUG_ON(!root->ref_cows); + BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); @@ -2506,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, BUG_ON(!root); /* no other choice for non-references counted tree */ - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return root; if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) @@ -2893,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, goto out; } - if (!root || root->ref_cows) { + if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = reserve_metadata_space(trans, rc, node); if (ret) goto out; } if (root) { - if (root->ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { BUG_ON(node->new_bytenr); BUG_ON(!list_empty(&node->list)); btrfs_record_root_in_trans(trans, root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 38bb47e7d6b1..360a728a639f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -306,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) break; } - root->orphan_item_inserted = 1; + set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); if (err) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0be77993378e..b6d198f5181e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -588,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { - ret = tree_backref_for_extent(&ptr, eb, ei, item_size, - &ref_root, &ref_level); + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " @@ -717,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) out: if (page) put_page(page); - if (inode) - iput(inode); + + iput(inode); if (ret < 0) return ret; @@ -2724,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); - if (found_key.offset + length <= start) { - key.offset = found_key.offset + length; - btrfs_release_path(path); - continue; - } + if (found_key.offset + length <= start) + goto skip; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -2739,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * the chunk from going away while we scrub it */ cache = btrfs_lookup_block_group(fs_info, chunk_offset); - if (!cache) { - ret = -ENOENT; - break; - } + + /* some chunks are removed but not committed to disk yet, + * continue scrubbing */ + if (!cache) + goto skip; + dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -2801,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - +skip: key.offset = found_key.offset + length; btrfs_release_path(path); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1ac3ca98c429..6528aa662181 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -349,16 +349,24 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + path_len = p->end - p->start; old_buf_len = p->buf_len; /* * First time the inline_buf does not suffice */ - if (p->buf == p->inline_buf) + if (p->buf == p->inline_buf) { tmp_buf = kmalloc(len, GFP_NOFS); - else + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); + } else { tmp_buf = krealloc(p->buf, len, GFP_NOFS); + } if (!tmp_buf) return -ENOMEM; p->buf = tmp_buf; @@ -967,7 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - const int buf_len = PATH_MAX; + int buf_len; u32 name_len; u32 data_len; u32 cur; @@ -977,6 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; + if (found_key->type == BTRFS_XATTR_ITEM_KEY) + buf_len = BTRFS_MAX_XATTR_SIZE(root); + else + buf_len = PATH_MAX; + buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -998,12 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - /* - * Path too long - */ - if (name_len + data_len > buf_len) { - ret = -ENAMETOOLONG; - goto out; + if (type == BTRFS_FT_XATTR) { + if (name_len > XATTR_NAME_MAX) { + ret = -ENAMETOOLONG; + goto out; + } + if (name_len + data_len > buf_len) { + ret = -E2BIG; + goto out; + } + } else { + /* + * Path too long + */ + if (name_len + data_len > buf_len) { + ret = -ENAMETOOLONG; + goto out; + } } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -1341,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -EIO; btrfs_err(sctx->send_root->fs_info, "did not find backref in " "send_root. inode=%llu, offset=%llu, " - "disk_byte=%llu found extent=%llu\n", + "disk_byte=%llu found extent=%llu", ino, data_offset, disk_byte, found_key.objectid); goto out; } @@ -1620,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root, goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.type == BTRFS_ROOT_ITEM_KEY) { + ret = -ENOENT; + goto out; + } *found_inode = key.objectid; *found_type = btrfs_dir_type(path->nodes[0], di); @@ -1663,7 +1691,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); @@ -1685,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; btrfs_release_path(path); - ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; + if (dir_gen) { + ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, + NULL, NULL, NULL); + if (ret < 0) + goto out; + } *dir = parent_dir; @@ -1704,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root, int ret; struct fs_path *tmp_name; u64 tmp_dir; - u64 tmp_dir_gen; tmp_name = fs_path_alloc(); if (!tmp_name) return -ENOMEM; - ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); + ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); if (ret < 0) goto out; @@ -2021,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, { int ret; int nce_ret; - struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; /* @@ -2047,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, } } - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; - /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in @@ -2126,7 +2150,6 @@ out_cache: name_cache_clean_unused(sctx); out: - btrfs_free_path(path); return ret; } @@ -2937,7 +2960,9 @@ static void free_waiting_dir_move(struct send_ctx *sctx, static int add_pending_dir_move(struct send_ctx *sctx, u64 ino, u64 ino_gen, - u64 parent_ino) + u64 parent_ino, + struct list_head *new_refs, + struct list_head *deleted_refs) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2969,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx, } } - list_for_each_entry(cur, &sctx->deleted_refs, list) { + list_for_each_entry(cur, deleted_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; } - list_for_each_entry(cur, &sctx->new_refs, list) { + list_for_each_entry(cur, new_refs, list) { ret = dup_ref(cur, &pm->update_refs); if (ret < 0) goto out; @@ -3017,6 +3042,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3028,6 +3095,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; int ret; + u64 ancestor = 0; name = fs_path_alloc(); from_path = fs_path_alloc(); @@ -3046,34 +3114,33 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (ret < 0) goto out; - if (parent_ino == sctx->cur_ino) { - /* child only renamed, not moved */ - ASSERT(parent_gen == sctx->cur_inode_gen); - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, - from_path); - if (ret < 0) - goto out; - ret = fs_path_add_path(from_path, name); - if (ret < 0) - goto out; - } else { - /* child moved and maybe renamed too */ - sctx->send_progress = pm->ino; - ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + + sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs); if (ret < 0) goto out; - } - - fs_path_free(name); - name = NULL; - - to_path = fs_path_alloc(); - if (!to_path) { - ret = -ENOMEM; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } goto out; } - - sctx->send_progress = sctx->cur_ino + 1; + fs_path_reset(name); + to_path = name; + name = NULL; ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; @@ -3197,127 +3264,74 @@ out: static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref) { - int ret; + int ret = 0; u64 ino = parent_ref->dir; u64 parent_ino_before, parent_ino_after; - u64 old_gen; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; - int register_upper_dirs; - u64 gen; - - if (is_waiting_for_move(sctx, ino)) - return 1; - - if (parent_ref->dir <= sctx->cur_ino) - return 0; - - ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, - NULL, NULL, NULL, NULL); - if (ret == -ENOENT) - return 0; - else if (ret < 0) - return ret; - - if (parent_ref->dir_gen != old_gen) - return 0; - - path_before = fs_path_alloc(); - if (!path_before) - return -ENOMEM; - - ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, - NULL, path_before); - if (ret == -ENOENT) { - ret = 0; - goto out; - } else if (ret < 0) { - goto out; - } path_after = fs_path_alloc(); - if (!path_after) { + path_before = fs_path_alloc(); + if (!path_after || !path_before) { ret = -ENOMEM; goto out; } - ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - &gen, path_after); - if (ret == -ENOENT) { - ret = 0; - goto out; - } else if (ret < 0) { - goto out; - } - - len1 = fs_path_len(path_before); - len2 = fs_path_len(path_after); - if (parent_ino_before != parent_ino_after || len1 != len2 || - memcmp(path_before->start, path_after->start, len1)) { - ret = 1; - goto out; - } - ret = 0; - /* - * Ok, our new most direct ancestor has a higher inode number but - * wasn't moved/renamed. So maybe some of the new ancestors higher in - * the hierarchy have an higher inode number too *and* were renamed - * or moved - in this case we need to wait for the ancestor's rename - * or move operation before we can do the move/rename for the current - * inode. + * Our current directory inode may not yet be renamed/moved because some + * ancestor (immediate or not) has to be renamed/moved first. So find if + * such ancestor exists and make sure our own rename/move happens after + * that ancestor is processed. */ - register_upper_dirs = 0; - ino = parent_ino_after; -again: - while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { - u64 parent_gen; + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + if (is_waiting_for_move(sctx, ino)) { + ret = 1; + break; + } fs_path_reset(path_before); fs_path_reset(path_after); ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - &parent_gen, path_after); + NULL, path_after); if (ret < 0) goto out; ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, NULL, path_before); - if (ret == -ENOENT) { - ret = 0; - break; - } else if (ret < 0) { + if (ret < 0 && ret != -ENOENT) { goto out; + } else if (ret == -ENOENT) { + ret = 1; + break; } len1 = fs_path_len(path_before); len2 = fs_path_len(path_after); - if (parent_ino_before != parent_ino_after || len1 != len2 || - memcmp(path_before->start, path_after->start, len1)) { + if (ino > sctx->cur_ino && + (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1))) { ret = 1; - if (register_upper_dirs) { - break; - } else { - register_upper_dirs = 1; - ino = parent_ref->dir; - gen = parent_ref->dir_gen; - goto again; - } - } else if (register_upper_dirs) { - ret = add_pending_dir_move(sctx, ino, gen, - parent_ino_after); - if (ret < 0 && ret != -EEXIST) - goto out; + break; } - ino = parent_ino_after; - gen = parent_gen; } out: fs_path_free(path_before); fs_path_free(path_after); + if (ret == 1) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + ino, + &sctx->new_refs, + &sctx->deleted_refs); + if (!ret) + ret = 1; + } + return ret; } @@ -3478,10 +3492,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = add_pending_dir_move(sctx, - sctx->cur_ino, - sctx->cur_inode_gen, - cur->dir); *pending_move = 1; } else { ret = send_rename(sctx, valid_path, @@ -5482,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) */ if (root->send_in_progress < 0) btrfs_err(root->fs_info, - "send_in_progres unbalanced %d root %llu\n", + "send_in_progres unbalanced %d root %llu", root->send_in_progress, root->root_key.objectid); spin_unlock(&root->root_item_lock); } @@ -5510,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) /* * The subvolume must remain read-only during send, protect against - * making it RW. + * making it RW. This also protects against deletion. */ spin_lock(&send_root->root_item_lock); send_root->send_in_progress++; @@ -5570,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } sctx->send_root = send_root; + /* + * Unlikely but possible, if the subvolume is marked for deletion but + * is slow to remove the directory entry, send can still be started + */ + if (btrfs_root_dead(sctx->send_root)) { + ret = -EPERM; + goto out; + } + sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; @@ -5659,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) spin_lock(&sctx->parent_root->root_item_lock); sctx->parent_root->send_in_progress++; - if (!btrfs_root_readonly(sctx->parent_root)) { + if (!btrfs_root_readonly(sctx->parent_root) || + btrfs_root_dead(sctx->parent_root)) { spin_unlock(&sctx->parent_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5011aadacab8..8e16bca69c56 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -385,20 +385,6 @@ static match_table_t tokens = { {Opt_err, NULL}, }; -#define btrfs_set_and_info(root, opt, fmt, args...) \ -{ \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ -} - -#define btrfs_clear_and_info(root, opt, fmt, args...) \ -{ \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ -} - /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -525,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } else if (compress) { if (!btrfs_test_opt(root, COMPRESS)) btrfs_info(root->fs_info, - "btrfs: use %s compression\n", + "btrfs: use %s compression", compress_type); } break; @@ -536,9 +522,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_ssd_spread: btrfs_set_and_info(root, SSD_SPREAD, "use spread ssd allocation scheme"); + btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_clear_and_info(root, NOSSD, + btrfs_set_and_info(root, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; @@ -594,8 +581,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } break; case Opt_acl: +#ifdef CONFIG_BTRFS_FS_POSIX_ACL root->fs_info->sb->s_flags |= MS_POSIXACL; break; +#else + btrfs_err(root->fs_info, + "support for ACL not compiled in!"); + ret = -EINVAL; + goto out; +#endif case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; @@ -1186,7 +1180,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return ERR_PTR(-ENOMEM); mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); - kfree(newargs); if (PTR_RET(mnt) == -EBUSY) { if (flags & MS_RDONLY) { @@ -1196,17 +1189,22 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, int r; mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, newargs); - if (IS_ERR(mnt)) + if (IS_ERR(mnt)) { + kfree(newargs); return ERR_CAST(mnt); + } r = btrfs_remount(mnt->mnt_sb, &flags, NULL); if (r < 0) { /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); return ERR_PTR(r); } } } + kfree(newargs); + if (IS_ERR(mnt)) return ERR_CAST(mnt); @@ -1423,6 +1421,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. */ + cancel_work_sync(&fs_info->async_reclaim_work); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); @@ -1469,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; /* recover relocation */ + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret) goto restore; @@ -1810,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; + if (!dev->name) + continue; if (!first_dev || dev->devid < first_dev->devid) first_dev = dev; } @@ -1904,6 +1907,9 @@ static int btrfs_run_sanity_tests(void) if (ret) goto out; ret = btrfs_test_inodes(); + if (ret) + goto out; + ret = btrfs_test_qgroups(); out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c5eb2143dc66..78699364f537 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -254,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) +#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -266,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); struct btrfs_block_group_cache *block_group; - int index = kobj - sinfo->block_group_kobjs; + int index = to_raid_kobj(kobj)->raid_type; u64 val = 0; down_read(&sinfo->groups_sem); @@ -288,7 +289,7 @@ static struct attribute *raid_attributes[] = { static void release_raid_kobj(struct kobject *kobj) { - kobject_put(kobj->parent); + kfree(to_raid_kobj(kobj)); } struct kobj_type btrfs_raid_ktype = { @@ -374,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_root *root = fs_info->fs_root; int ret; - if (len >= BTRFS_LABEL_SIZE) { - pr_err("BTRFS: unable to set label with more than %d bytes\n", - BTRFS_LABEL_SIZE - 1); + if (len >= BTRFS_LABEL_SIZE) return -EINVAL; - } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) @@ -396,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj, } BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); +static ssize_t btrfs_no_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + return -EPERM; +} + +static ssize_t btrfs_nodesize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); +} + +BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); + +static ssize_t btrfs_sectorsize_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); + +static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); +} + +BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); + static struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), + BTRFS_ATTR_PTR(nodesize), + BTRFS_ATTR_PTR(sectorsize), + BTRFS_ATTR_PTR(clone_alignment), NULL, }; @@ -567,14 +605,37 @@ static void init_feature_attrs(void) } } -static int add_device_membership(struct btrfs_fs_info *fs_info) +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) +{ + struct hd_struct *disk; + struct kobject *disk_kobj; + + if (!fs_info->device_dir_kobj) + return -EINVAL; + + if (one_device) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_info->device_dir_kobj, + disk_kobj->name); + } + + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device) { int error = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *dev; - fs_info->device_dir_kobj = kobject_create_and_add("devices", + if (!fs_info->device_dir_kobj) + fs_info->device_dir_kobj = kobject_create_and_add("devices", &fs_info->super_kobj); + if (!fs_info->device_dir_kobj) return -ENOMEM; @@ -585,6 +646,9 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) if (!dev->bdev) continue; + if (one_device && one_device != dev) + continue; + disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; @@ -628,7 +692,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) if (error) goto failure; - error = add_device_membership(fs_info); + error = btrfs_kobj_add_device(fs_info, NULL); if (error) goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9ab576318a84..ac46df37504c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -66,4 +66,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; +int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); +int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *one_device); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 757ef00a75a4..9626252ee6b4 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@ #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../qgroup.h" static struct vfsmount *test_mnt = NULL; @@ -72,3 +75,97 @@ void btrfs_destroy_test_fs(void) kern_unmount(test_mnt); unregister_filesystem(&test_type); } + +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +{ + struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), + GFP_NOFS); + + if (!fs_info) + return fs_info; + fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), + GFP_NOFS); + if (!fs_info->fs_devices) { + kfree(fs_info); + return NULL; + } + fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), + GFP_NOFS); + if (!fs_info->super_copy) { + kfree(fs_info->fs_devices); + kfree(fs_info); + return NULL; + } + + if (init_srcu_struct(&fs_info->subvol_srcu)) { + kfree(fs_info->fs_devices); + kfree(fs_info->super_copy); + kfree(fs_info); + return NULL; + } + + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->qgroup_lock); + spin_lock_init(&fs_info->qgroup_op_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->tree_mod_seq_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + mutex_init(&fs_info->qgroup_rescan_lock); + rwlock_init(&fs_info->tree_mod_log_lock); + fs_info->running_transaction = NULL; + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_ulist = NULL; + atomic64_set(&fs_info->tree_mod_seq, 0); + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + return fs_info; +} + +static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +{ + struct radix_tree_iter iter; + void **slot; + + spin_lock(&fs_info->buffer_lock); +restart: + radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { + struct extent_buffer *eb; + + eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); + if (!eb) + continue; + /* Shouldn't happen but that kind of thinking creates CVE's */ + if (radix_tree_exception(eb)) { + if (radix_tree_deref_retry(eb)) + goto restart; + continue; + } + spin_unlock(&fs_info->buffer_lock); + free_extent_buffer_stale(eb); + spin_lock(&fs_info->buffer_lock); + } + spin_unlock(&fs_info->buffer_lock); + + btrfs_free_qgroup_config(fs_info); + btrfs_free_fs_roots(fs_info); + cleanup_srcu_struct(&fs_info->subvol_srcu); + kfree(fs_info->super_copy); + kfree(fs_info->fs_devices); + kfree(fs_info); +} + +void btrfs_free_dummy_root(struct btrfs_root *root) +{ + if (!root) + return; + if (root->node) + free_extent_buffer(root->node); + if (root->fs_info) + btrfs_free_dummy_fs_info(root->fs_info); + kfree(root); +} + diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 312560a9123d..fd3954224480 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,13 +23,18 @@ #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) +struct btrfs_root; + int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); int btrfs_test_extent_io(void); int btrfs_test_inodes(void); +int btrfs_test_qgroups(void); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_root(struct btrfs_root *root); #else static inline int btrfs_test_free_space_cache(void) { @@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void) { return 0; } +static inline int btrfs_test_qgroups(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 397d1f99a8eb..3ae0f5b8bb80 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -23,33 +23,6 @@ #include "../extent_io.h" #include "../volumes.h" -static struct btrfs_fs_info *alloc_dummy_fs_info(void) -{ - struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), - GFP_NOFS); - if (!fs_info) - return fs_info; - fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), - GFP_NOFS); - if (!fs_info->fs_devices) { - kfree(fs_info); - return NULL; - } - return fs_info; -} -static void free_dummy_root(struct btrfs_root *root) -{ - if (!root) - return; - if (root->fs_info) { - kfree(root->fs_info->fs_devices); - kfree(root->fs_info); - } - if (root->node) - free_extent_buffer(root->node); - kfree(root); -} - static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, u64 disk_len, u32 type, u8 compression, int slot) @@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void) * We do this since btrfs_get_extent wants to assign em->bdev to * root->fs_info->fs_devices->latest_bdev. */ - root->fs_info = alloc_dummy_fs_info(); + root->fs_info = btrfs_alloc_dummy_fs_info(); if (!root->fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; @@ -837,7 +810,7 @@ out: if (!IS_ERR(em)) free_extent_map(em); iput(inode); - free_dummy_root(root); + btrfs_free_dummy_root(root); return ret; } @@ -864,7 +837,7 @@ static int test_hole_first(void) goto out; } - root->fs_info = alloc_dummy_fs_info(); + root->fs_info = btrfs_alloc_dummy_fs_info(); if (!root->fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; @@ -934,7 +907,7 @@ out: if (!IS_ERR(em)) free_extent_map(em); iput(inode); - free_dummy_root(root); + btrfs_free_dummy_root(root); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c new file mode 100644 index 000000000000..ec3dcb202357 --- /dev/null +++ b/fs/btrfs/tests/qgroup-tests.c @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2013 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../transaction.h" +#include "../disk-io.h" +#include "../qgroup.h" + +static void init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} + +static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_extent_inline_ref *iref; + struct btrfs_tree_block_info *block_info; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); + int ret; + + init_dummy_trans(&trans); + + ins.objectid = bytenr; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); + if (ret) { + test_msg("Couldn't insert ref %d\n", ret); + btrfs_free_path(path); + return ret; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, 1); + btrfs_set_extent_generation(leaf, item, 1); + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(item + 1); + btrfs_set_tree_block_level(leaf, block_info, 1); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_free_path(path); + return 0; +} + +static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs + 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); + if (ret) + test_msg("Failed to insert backref\n"); + btrfs_free_path(path); + return ret; +} + +static int remove_extent_item(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_trans_handle trans; + struct btrfs_key key; + struct btrfs_path *path; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + path->leave_spinning = 1; + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Didn't find our key %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return 0; +} + +static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 root_objectid) +{ + struct btrfs_trans_handle trans; + struct btrfs_extent_item *item; + struct btrfs_path *path; + struct btrfs_key key; + u64 refs; + int ret; + + init_dummy_trans(&trans); + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + path->leave_spinning = 1; + ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); + if (ret) { + test_msg("Couldn't find extent ref\n"); + btrfs_free_path(path); + return ret; + } + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + refs = btrfs_extent_refs(path->nodes[0], item); + btrfs_set_extent_refs(path->nodes[0], item, refs - 1); + btrfs_release_path(path); + + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); + if (ret) { + test_msg("Couldn't find backref %d\n", ret); + btrfs_free_path(path); + return ret; + } + btrfs_del_item(&trans, root, path); + btrfs_free_path(path); + return ret; +} + +static int test_no_shared_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup basic add\n"); + ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_item(root, 4096, 4096); + if (ret) + return -EINVAL; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_EXCL, 0); + if (ret) { + test_msg("Couldn't remove space from the qgroup %d\n", ret); + return -EINVAL; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +/* + * Add a ref for two different roots to make sure the shared value comes out + * right, also remove one of the roots and make sure the exclusive count is + * adjusted properly. + */ +static int test_multiple_refs(struct btrfs_root *root) +{ + struct btrfs_trans_handle trans; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + init_dummy_trans(&trans); + + test_msg("Qgroup multiple refs test\n"); + + /* We have 5 created already from the previous test */ + ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + if (ret) { + test_msg("Couldn't create a qgroup %d\n", ret); + return ret; + } + + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_EXCL, 0); + if (ret) { + test_msg("Couldn't add space to a qgroup %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Delayed qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = add_tree_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_ADD_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + ret = remove_extent_ref(root, 4096, 4096, 0, 256); + if (ret) + return ret; + + ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, + BTRFS_QGROUP_OPER_SUB_SHARED, 0); + if (ret) { + test_msg("Qgroup record ref failed %d\n", ret); + return ret; + } + + ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + if (ret) { + test_msg("Qgroup accounting failed %d\n", ret); + return ret; + } + + if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + test_msg("Qgroup counts didn't match expected values\n"); + return -EINVAL; + } + + return 0; +} + +int btrfs_test_qgroups(void) +{ + struct btrfs_root *root; + struct btrfs_root *tmp_root; + int ret = 0; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + return PTR_ERR(root); + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + /* + * Can't use bytenr 0, some things freak out + * *cough*backref walking code*cough* + */ + root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 5; + root->fs_info->fs_root = tmp_root; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + tmp_root = btrfs_alloc_dummy_root(); + if (IS_ERR(tmp_root)) { + test_msg("Couldn't allocate a fs root\n"); + ret = PTR_ERR(tmp_root); + goto out; + } + + tmp_root->root_key.objectid = 256; + ret = btrfs_insert_fs_root(root->fs_info, tmp_root); + if (ret) { + test_msg("Couldn't insert fs root %d\n", ret); + goto out; + } + + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to addt he root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; + + test_msg("Running qgroup tests\n"); + ret = test_no_shared_qgroup(root); + if (ret) + goto out; + ret = test_multiple_refs(root); +out: + btrfs_free_dummy_root(root); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7579f6d0b854..5f379affdf23 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@ #include "inode-map.h" #include "volumes.h" #include "dev-replace.h" +#include "qgroup.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -241,18 +242,19 @@ loop: static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (root->ref_cows && root->last_trans < trans->transid) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); /* - * see below for in_trans_setup usage rules + * see below for IN_TRANS_SETUP usage rules * we have the reloc mutex held now, so there * is only one writer in this function */ - root->in_trans_setup = 1; + set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); - /* make sure readers find in_trans_setup before + /* make sure readers find IN_TRANS_SETUP before * they find our root->last_trans update */ smp_wmb(); @@ -279,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * But, we have to set root->last_trans before we * init the relocation root, otherwise, we trip over warnings * in ctree.c. The solution used here is to flag ourselves - * with root->in_trans_setup. When this is 1, we're still + * with root IN_TRANS_SETUP. When this is 1, we're still * fixing up the reloc trees and everyone must wait. * * When this is zero, they can trust root->last_trans and fly @@ -288,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * done before we pop in the zero below */ btrfs_init_reloc_root(trans, root); - smp_wmb(); - root->in_trans_setup = 0; + smp_mb__before_atomic(); + clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } return 0; } @@ -298,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!root->ref_cows) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; /* - * see record_root_in_trans for comments about in_trans_setup usage + * see record_root_in_trans for comments about IN_TRANS_SETUP usage * and barriers */ smp_rmb(); if (root->last_trans == trans->transid && - !root->in_trans_setup) + !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; mutex_lock(&root->fs_info->reloc_mutex); @@ -365,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type) static inline bool need_reserve_reloc_root(struct btrfs_root *root) { if (!root->fs_info->reloc_ctl || - !root->ref_cows || + !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; @@ -384,11 +386,13 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, bool reloc_reserved = false; int ret; + /* Send isn't supposed to start transactions. */ + ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info && - current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { + if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; @@ -489,6 +493,7 @@ again: smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && may_wait_transaction(root, type)) { + current->journal_info = h; btrfs_commit_transaction(h, root); goto again; } @@ -695,6 +700,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; + int must_run_delayed_refs = 0; if (trans->use_count > 1) { trans->use_count--; @@ -702,14 +708,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } - /* - * do the qgroup accounting as early as possible - */ - err = btrfs_delayed_refs_qgroup_accounting(trans, info); - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (!list_empty(&trans->new_bgs)) + btrfs_create_pending_block_groups(trans, root); + + trans->delayed_ref_updates = 0; + if (!trans->sync) { + must_run_delayed_refs = + btrfs_should_throttle_delayed_refs(trans, root); + cur = max_t(unsigned long, cur, 32); + + /* + * don't make the caller wait if they are from a NOLOCK + * or ATTACH transaction, it will deadlock with commit + */ + if (must_run_delayed_refs == 1 && + (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) + must_run_delayed_refs = 2; + } + if (trans->qgroup_reserved) { /* * the same root has to be passed here between start_transaction @@ -719,16 +738,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->qgroup_reserved = 0; } - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, root); - - trans->delayed_ref_updates = 0; - if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) { - cur = max_t(unsigned long, cur, 32); - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -778,6 +787,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (must_run_delayed_refs) { + btrfs_async_run_delayed_refs(root, cur, + must_run_delayed_refs == 1); + } return err; } @@ -1049,8 +1062,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); /* see comments in should_cow_block() */ - root->force_cow = 0; - smp_wmb(); + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); + smp_mb__after_atomic(); if (root->commit_root != root->node) { list_add_tail(&root->dirty_list, @@ -1081,7 +1094,7 @@ int btrfs_defrag_root(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - if (xchg(&root->defrag_running, 1)) + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) return 0; while (1) { @@ -1104,7 +1117,7 @@ int btrfs_defrag_root(struct btrfs_root *root) break; } } - root->defrag_running = 0; + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); return ret; } @@ -1168,12 +1181,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto no_free_objectid; } - pending->error = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (pending->error) - goto no_free_objectid; - key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1270,8 +1277,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } + /* + * We need to flush delayed refs in order to make sure all of our quota + * operations have been done before we call btrfs_qgroup_inherit. + */ + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + /* see comments in should_cow_block() */ - root->force_cow = 1; + set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); btrfs_set_root_node(new_root_item, tmp); @@ -1593,17 +1618,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, int ret; ret = btrfs_run_delayed_items(trans, root); - /* - * running the delayed items may have added new refs. account - * them now so that they hinder processing of more delayed refs - * as little as possible. - */ - if (ret) { - btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); - return ret; - } - - ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); if (ret) return ret; @@ -1984,19 +1998,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); - /* - * Make sure root is not involved in send, - * if we fail with first root, we return - * directly rather than continue. - */ - spin_lock(&root->root_item_lock); - if (root->send_in_progress) { - spin_unlock(&fs_info->trans_lock); - spin_unlock(&root->root_item_lock); - return 0; - } - spin_unlock(&root->root_item_lock); - list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b57b924e8e03..7dd558ed0716 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -69,6 +69,7 @@ struct btrfs_transaction { #define __TRANS_ATTACH (1U << 10) #define __TRANS_JOIN (1U << 11) #define __TRANS_JOIN_NOLOCK (1U << 12) +#define __TRANS_DUMMY (1U << 13) #define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 76928ca97741..a63719cc9578 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } - if (root->ref_cows == 0) + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; if (btrfs_test_opt(root, SSD)) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e2f45fc02610..9e1f2cd5e67a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -20,13 +20,11 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/list_sort.h> -#include "ctree.h" -#include "transaction.h" +#include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "tree-log.h" #include "hash.h" /* magic values for the inode_only field in btrfs_log_inode: @@ -144,17 +142,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; goto out; } - if (!root->log_start_pid) { root->log_start_pid = current->pid; - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } else if (root->log_start_pid != current->pid) { - root->log_multiple_pids = true; + set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } atomic_inc(&root->log_batch); @@ -181,7 +177,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ret) goto out; } - root->log_multiple_pids = false; + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); @@ -2500,7 +2496,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { + if (!btrfs_test_opt(root, SSD) && + test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); @@ -2511,8 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2533,8 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); goto out; } @@ -2577,8 +2572,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); + if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -2622,8 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { + if (btrfs_need_log_full_commit(root->fs_info, trans)) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); @@ -2637,8 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2667,8 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } @@ -2886,7 +2878,7 @@ fail: out_unlock: mutex_unlock(&BTRFS_I(dir)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) btrfs_abort_transaction(trans, root, ret); @@ -2919,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, root, ret); @@ -4130,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * make sure any commits to the log are forced * to be full commits */ - root->fs_info->last_trans_log_full_commit = - trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; break; } @@ -4177,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } + /* + * The prev transaction commit doesn't complete, we need do + * full commit by ourselves. + */ if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; @@ -4246,7 +4241,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_set_log_full_commit(root->fs_info, trans); ret = 1; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 91b145fce333..7f5b41bd5373 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +#include "ctree.h" +#include "transaction.h" + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 @@ -35,6 +38,19 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) INIT_LIST_HEAD(&ctx->list); } +static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid; +} + +static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) +{ + return ACCESS_ONCE(fs_info->last_trans_log_full_commit) == + trans->transid; +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49d7fab73360..6cb82f62cb7c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -40,6 +40,7 @@ #include "rcu-string.h" #include "math.h" #include "dev-replace.h" +#include "sysfs.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -554,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) * This is ok to do without rcu read locked because we hold the * uuid mutex so nothing we touch in here is going to disappear. */ - name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); - if (!name) { - kfree(device); - goto error; + if (orig_dev->name) { + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { + kfree(device); + goto error; + } + rcu_assign_pointer(device->name, name); } - rcu_assign_pointer(device->name, name); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -1452,6 +1455,22 @@ out: return ret; } +/* + * Function to update ctime/mtime for a given device path. + * Mainly used for ctime/mtime based probe like libblkid. + */ +static void update_dev_time(char *path_name) +{ + struct file *filp; + + filp = filp_open(path_name, O_RDWR, 0); + if (!filp) + return; + file_update_time(filp); + filp_close(filp, NULL); + return; +} + static int btrfs_rm_dev_item(struct btrfs_root *root, struct btrfs_device *device) { @@ -1661,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev == root->fs_info->fs_devices->latest_bdev) root->fs_info->fs_devices->latest_bdev = next_device->bdev; - if (device->bdev) + if (device->bdev) { device->fs_devices->open_devices--; + /* remove sysfs entry */ + btrfs_kobj_rm_device(root->fs_info, device); + } call_rcu(&device->rcu, free_device); @@ -1674,11 +1696,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) struct btrfs_fs_devices *fs_devices; fs_devices = root->fs_info->fs_devices; while (fs_devices) { - if (fs_devices->seed == cur_devices) + if (fs_devices->seed == cur_devices) { + fs_devices->seed = cur_devices->seed; break; + } fs_devices = fs_devices->seed; } - fs_devices->seed = cur_devices->seed; cur_devices->seed = NULL; lock_chunks(root); __btrfs_close_devices(cur_devices); @@ -1694,20 +1717,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * remove it from the devices list and zero out the old super */ if (clear_super && disk_super) { + u64 bytenr; + int i; + /* make sure this device isn't detected as part of * the FS anymore */ memset(&disk_super->magic, 0, sizeof(disk_super->magic)); set_buffer_dirty(bh); sync_dirty_buffer(bh); + + /* clear the mirror copies of super block on the disk + * being removed, 0th copy is been taken care above and + * the below would take of the rest + */ + for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) + break; + + brelse(bh); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + if (!bh) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + continue; + } + memset(&disk_super->magic, 0, + sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } } ret = 0; - /* Notify udev that device has changed */ - if (bdev) + if (bdev) { + /* Notify udev that device has changed */ btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); + } + error_brelse: brelse(bh); if (bdev) @@ -1883,7 +1941,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; - fs_devices->total_devices = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -2092,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); + + /* add sysfs device entry */ + btrfs_kobj_add_device(root->fs_info, device); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = init_first_rw_device(trans, root, device); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2105,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) btrfs_abort_transaction(trans, root, ret); goto error_trans; } + + /* Sprouting would change fsid of the mounted root, + * so rename the fsid on the sysfs + */ + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", + root->fs_info->fsid); + if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) + goto error_trans; } else { ret = btrfs_add_device(trans, root, device); if (ret) { @@ -2146,12 +2216,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_commit_transaction(trans, root); } + /* Update ctime/mtime for libblkid */ + update_dev_time(device_path); return ret; error_trans: unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); + btrfs_kobj_rm_device(root->fs_info, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2490,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - kfree(map); - em->bdev = NULL; - /* once for the tree */ free_extent_map(em); /* once for us */ @@ -2922,6 +2992,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* + * limited by count, must be the last filter + */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { + if (bargs->limit == 0) + return 0; + else + bargs->limit--; + } + return 1; } @@ -2944,6 +3024,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + u64 limit_data = bctl->data.limit; + u64 limit_meta = bctl->meta.limit; + u64 limit_sys = bctl->sys.limit; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -2982,6 +3065,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) memset(&bctl->stat, 0, sizeof(bctl->stat)); spin_unlock(&fs_info->balance_lock); again: + if (!counting) { + bctl->data.limit = limit_data; + bctl->meta.limit = limit_meta; + bctl->sys.limit = limit_sys; + } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -3881,7 +3969,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, u8 *ptr; array_size = btrfs_super_sys_array_size(super_copy); - if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + if (array_size + item_size + sizeof(disk_key) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) return -EFBIG; ptr = super_copy->sys_chunk_array + array_size; @@ -3986,6 +4075,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } +#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ + - sizeof(struct btrfs_item) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 start, u64 type) @@ -4035,6 +4134,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = 1024 * 1024 * 1024; max_chunk_size = 10 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) @@ -4042,11 +4143,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = 256 * 1024 * 1024; max_chunk_size = max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = 32 * 1024 * 1024; max_chunk_size = 2 * max_stripe_size; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; } else { - btrfs_err(info, "invalid chunk type 0x%llx requested\n", + btrfs_err(info, "invalid chunk type 0x%llx requested", type); BUG_ON(1); } @@ -4213,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em = alloc_extent_map(); if (!em) { + kfree(map); ret = -ENOMEM; goto error; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = start; em->len = num_bytes; @@ -4258,7 +4365,6 @@ error_del_extent: /* One for the tree reference */ free_extent_map(em); error: - kfree(map); kfree(devices_info); return ret; } @@ -4294,7 +4400,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, if (em->start != chunk_offset || em->len != chunk_size) { btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted" - " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset, + " %Lu-%Lu, found %Lu-%Lu", chunk_offset, chunk_size, em->start, em->len); free_extent_map(em); return -EINVAL; @@ -4470,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) write_unlock(&tree->map_tree.lock); if (!em) break; - kfree(em->bdev); /* once for us */ free_extent_map(em); /* once for the tree */ @@ -4496,14 +4601,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * and exit, so return 1 so the callers don't try to use other copies. */ if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, + btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, logical+len); return 1; } if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " - "%Lu-%Lu\n", logical, logical+len, em->start, + "%Lu-%Lu", logical, logical+len, em->start, em->start + em->len); free_extent_map(em); return 1; @@ -4684,7 +4789,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (em->start > logical || em->start + em->len < logical) { btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " - "found %Lu-%Lu\n", logical, em->start, + "found %Lu-%Lu", logical, em->start, em->start + em->len); free_extent_map(em); return -EINVAL; @@ -5274,6 +5379,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) +{ + if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) + bio_endio_nodec(bio, err); + else + bio_endio(bio, err); + kfree(bbio); +} + static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; @@ -5314,12 +5428,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - /* - * We have original bio now. So increment bi_remaining to - * account for it in endio - */ - atomic_inc(&bio->bi_remaining); - bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -5336,9 +5444,8 @@ static void btrfs_end_bio(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(bbio); - bio_endio(bio, err); + btrfs_end_bbio(bbio, bio, err); } else if (!is_orig_bio) { bio_put(bio); } @@ -5501,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { + /* Shoud be the original bio. */ + WARN_ON(bio != bbio->orig_bio); + bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - kfree(bbio); - bio_endio(bio, -EIO); + + btrfs_end_bbio(bbio, bio, -EIO); } } @@ -5593,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG_ON(!bio); /* -ENOMEM */ } else { bio = first_bio; + bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; } submit_stripe_bio(root, bbio, bio, @@ -5734,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, return -ENOMEM; } + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->bdev = (struct block_device *)map; em->start = logical; em->len = length; @@ -5758,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { - kfree(map); free_extent_map(em); return -EIO; } @@ -5766,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev = add_missing_dev(root, devid, uuid); if (!map->stripes[i].dev) { - kfree(map); free_extent_map(em); return -EIO; } @@ -6058,10 +6168,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->dev_root = fs_info->dev_root; - mutex_unlock(&fs_devices->device_list_mutex); + while (fs_devices) { + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->dev_root = fs_info->dev_root; + mutex_unlock(&fs_devices->device_list_mutex); + + fs_devices = fs_devices->seed; + } } static void __btrfs_reset_dev_stats(struct btrfs_device *dev) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 80754f9dd3df..2aaa00c47816 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -190,11 +190,14 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); +#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 + struct btrfs_bio { atomic_t stripes_pending; struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; struct bio *orig_bio; + unsigned long flags; void *private; atomic_t error; int max_errors; @@ -255,6 +258,7 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) /* * Profile changing flags. When SOFT is set we won't relocate chunk if diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8e57191950cb..b67d8fc81277 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws, if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "BTRFS: deflateInit failed\n"); - ret = -1; + ret = -EIO; goto out; } @@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws, out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws, printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); zlib_deflateEnd(&workspace->def_strm); - ret = -1; + ret = -EIO; goto out; } @@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws, if (workspace->def_strm.total_in > 8192 && workspace->def_strm.total_in < workspace->def_strm.total_out) { - ret = -1; + ret = -E2BIG; goto out; } /* we need another page for writing out. Test this @@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws, kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; - ret = -1; + ret = -E2BIG; goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { - ret = -1; + ret = -ENOMEM; goto out; } cpage_out = kmap(out_page); @@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws, zlib_deflateEnd(&workspace->def_strm); if (ret != Z_STREAM_END) { - ret = -1; + ret = -EIO; goto out; } if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { - ret = -1; + ret = -E2BIG; goto out; } @@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); - return -1; + return -EIO; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); @@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, } } if (ret != Z_STREAM_END) - ret = -1; + ret = -EIO; else ret = 0; done: @@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); - return -1; + return -EIO; } while (bytes_left > 0) { @@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, total_out = workspace->inf_strm.total_out; if (total_out == buf_start) { - ret = -1; + ret = -EIO; break; } @@ -382,7 +382,7 @@ next: } if (ret != Z_STREAM_END && bytes_left != 0) - ret = -1; + ret = -EIO; else ret = 0; diff --git a/fs/buffer.c b/fs/buffer.c index 9ddb9fc7d923..eba6e4f621ce 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -77,7 +77,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { clear_bit_unlock(BH_Lock, &bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&bh->b_state, BH_Lock); } EXPORT_SYMBOL(unlock_buffer); @@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) int all_mapped = 1; index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); - page = find_get_page(bd_mapping, index); + page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); if (!page) goto out; @@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { + /* __find_get_block_slow will mark the page accessed */ bh = __find_get_block_slow(bdev, block); if (bh) bh_lru_install(bh); - } - if (bh) + } else touch_buffer(bh); + return bh; } EXPORT_SYMBOL(__find_get_block); @@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page); /* * Called when truncating a buffer on a page completely. */ + +/* Bits that are cleared during an invalidate */ +#define BUFFER_FLAGS_DISCARD \ + (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ + 1 << BH_Delay | 1 << BH_Unwritten) + static void discard_buffer(struct buffer_head * bh) { + unsigned long b_state, b_state_old; + lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); + b_state = bh->b_state; + for (;;) { + b_state_old = cmpxchg(&bh->b_state, b_state, + (b_state & ~BUFFER_FLAGS_DISCARD)); + if (b_state_old == b_state) + break; + b_state = b_state_old; + } unlock_buffer(bh); } @@ -2879,10 +2891,9 @@ EXPORT_SYMBOL(block_truncate_page); /* * The generic ->writepage function for buffer-backed address_spaces - * this form passes in the end_io handler used to finish the IO. */ -int block_write_full_page_endio(struct page *page, get_block_t *get_block, - struct writeback_control *wbc, bh_end_io_t *handler) +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -2892,7 +2903,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) return __block_write_full_page(inode, page, get_block, wbc, - handler); + end_buffer_async_write); /* Is the page fully outside i_size? (truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); @@ -2915,18 +2926,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." */ zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, handler); -} -EXPORT_SYMBOL(block_write_full_page_endio); - -/* - * The generic ->writepage function for buffer-backed address_spaces - */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - return block_write_full_page_endio(page, get_block, wbc, - end_buffer_async_write); + return __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); } EXPORT_SYMBOL(block_write_full_page); diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 5b99bafc31d1..d749731dc0ee 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) cache->brun_percent < 100); if (*args) { - kerror("'bind' command doesn't take an argument"); + pr_err("'bind' command doesn't take an argument"); return -EINVAL; } if (!cache->rootdirname) { - kerror("No cache directory specified"); + pr_err("No cache directory specified"); return -EINVAL; } /* don't permit already bound caches to be re-bound */ if (test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("Cache already bound"); + pr_err("Cache already bound"); return -EBUSY; } @@ -228,9 +228,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) set_bit(CACHEFILES_READY, &cache->flags); dput(root); - printk(KERN_INFO "CacheFiles:" - " File cache on %s registered\n", - cache->cache.identifier); + pr_info("File cache on %s registered\n", cache->cache.identifier); /* check how much space the cache has */ cachefiles_has_space(cache, 0, 0); @@ -250,7 +248,7 @@ error_open_root: kmem_cache_free(cachefiles_object_jar, fsdef); error_root_object: cachefiles_end_secure(cache, saved_cred); - kerror("Failed to register: %d", ret); + pr_err("Failed to register: %d", ret); return ret; } @@ -262,9 +260,8 @@ void cachefiles_daemon_unbind(struct cachefiles_cache *cache) _enter(""); if (test_bit(CACHEFILES_READY, &cache->flags)) { - printk(KERN_INFO "CacheFiles:" - " File cache on %s unregistering\n", - cache->cache.identifier); + pr_info("File cache on %s unregistering\n", + cache->cache.identifier); fscache_withdraw_cache(&cache->cache); } diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 0a1467b15516..b078d3081d6c 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -315,8 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file, static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, char *args) { - kerror("Free space limits must be in range" - " 0%%<=stop<cull<run<100%%"); + pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%"); return -EINVAL; } @@ -476,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty directory specified"); + pr_err("Empty directory specified"); return -EINVAL; } if (cache->rootdirname) { - kerror("Second cache directory specified"); + pr_err("Second cache directory specified"); return -EEXIST; } @@ -504,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty security context specified"); + pr_err("Empty security context specified"); return -EINVAL; } if (cache->secctx) { - kerror("Second security context specified"); + pr_err("Second security context specified"); return -EINVAL; } @@ -532,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args) _enter(",%s", args); if (!*args) { - kerror("Empty tag specified"); + pr_err("Empty tag specified"); return -EINVAL; } @@ -563,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("cull applied to unready cache"); + pr_err("cull applied to unready cache"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - kerror("cull applied to dead cache"); + pr_err("cull applied to dead cache"); return -EIO; } @@ -588,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - kerror("cull command requires dirfd to be a directory"); + pr_err("cull command requires dirfd to be a directory"); return -ENOTDIR; inval: - kerror("cull command requires dirfd and filename"); + pr_err("cull command requires dirfd and filename"); return -EINVAL; } @@ -615,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args) return 0; inval: - kerror("debug command requires mask"); + pr_err("debug command requires mask"); return -EINVAL; } @@ -635,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) goto inval; if (!test_bit(CACHEFILES_READY, &cache->flags)) { - kerror("inuse applied to unready cache"); + pr_err("inuse applied to unready cache"); return -EIO; } if (test_bit(CACHEFILES_DEAD, &cache->flags)) { - kerror("inuse applied to dead cache"); + pr_err("inuse applied to dead cache"); return -EIO; } @@ -660,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) notdir: path_put(&path); - kerror("inuse command requires dirfd to be a directory"); + pr_err("inuse command requires dirfd to be a directory"); return -ENOTDIR; inval: - kerror("inuse command requires dirfd and filename"); + pr_err("inuse command requires dirfd and filename"); return -EINVAL; } diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 57e17fe6121a..584743d456c3 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -146,8 +146,7 @@ static int cachefiles_lookup_object(struct fscache_object *_object) if (ret < 0 && ret != -ETIMEDOUT) { if (ret != -ENOBUFS) - printk(KERN_WARNING - "CacheFiles: Lookup failed error %d\n", ret); + pr_warn("Lookup failed error %d\n", ret); fscache_object_lookup_error(&object->fscache); } diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 5349473df1b1..3d50998abf57 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -9,6 +9,13 @@ * 2 of the Licence, or (at your option) any later version. */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "CacheFiles: " fmt + + #include <linux/fscache-cache.h> #include <linux/timer.h> #include <linux/wait.h> @@ -245,11 +252,10 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, /* * error handling */ -#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__) #define cachefiles_io_error(___cache, FMT, ...) \ do { \ - kerror("I/O Error: " FMT, ##__VA_ARGS__); \ + pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ fscache_io_error(&(___cache)->cache); \ set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ } while (0) @@ -310,8 +316,8 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -319,9 +325,9 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ @@ -330,8 +336,8 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -339,9 +345,9 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "CacheFiles: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index 4bfa8cf43bf5..180edfb45f66 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -68,8 +68,7 @@ static int __init cachefiles_init(void) SLAB_HWCACHE_ALIGN, cachefiles_object_init_once); if (!cachefiles_object_jar) { - printk(KERN_NOTICE - "CacheFiles: Failed to allocate an object jar\n"); + pr_notice("Failed to allocate an object jar\n"); goto error_object_jar; } @@ -77,7 +76,7 @@ static int __init cachefiles_init(void) if (ret < 0) goto error_proc; - printk(KERN_INFO "CacheFiles: Loaded\n"); + pr_info("Loaded\n"); return 0; error_proc: @@ -85,7 +84,7 @@ error_proc: error_object_jar: misc_deregister(&cachefiles_dev); error_dev: - kerror("failed to register: %d", ret); + pr_err("failed to register: %d", ret); return ret; } @@ -96,7 +95,7 @@ fs_initcall(cachefiles_init); */ static void __exit cachefiles_exit(void) { - printk(KERN_INFO "CacheFiles: Unloading\n"); + pr_info("Unloading\n"); cachefiles_proc_cleanup(); kmem_cache_destroy(cachefiles_object_jar); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c0a681705104..5bf2b41e66d3 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -35,22 +35,21 @@ void __cachefiles_printk_object(struct cachefiles_object *object, struct fscache_cookie *cookie; unsigned keylen, loop; - printk(KERN_ERR "%sobject: OBJ%x\n", - prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", + pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id); + pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, object->fscache.state->name, object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask); - printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", + pr_err("%sops=%u inp=%u exc=%u\n", prefix, object->fscache.n_ops, object->fscache.n_in_progress, object->fscache.n_exclusive); - printk(KERN_ERR "%sparent=%p\n", + pr_err("%sparent=%p\n", prefix, object->fscache.parent); spin_lock(&object->fscache.lock); cookie = object->fscache.cookie; if (cookie) { - printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n", + pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", prefix, object->fscache.cookie, object->fscache.cookie->parent, @@ -62,16 +61,16 @@ void __cachefiles_printk_object(struct cachefiles_object *object, else keylen = 0; } else { - printk(KERN_ERR "%scookie=NULL\n", prefix); + pr_err("%scookie=NULL\n", prefix); keylen = 0; } spin_unlock(&object->fscache.lock); if (keylen) { - printk(KERN_ERR "%skey=[%u] '", prefix, keylen); + pr_err("%skey=[%u] '", prefix, keylen); for (loop = 0; loop < keylen; loop++) - printk("%02x", keybuf[loop]); - printk("'\n"); + pr_cont("%02x", keybuf[loop]); + pr_cont("'\n"); } } @@ -131,13 +130,11 @@ found_dentry: dentry); if (fscache_object_is_live(&object->fscache)) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error:" - " Can't preemptively bury live object\n"); + pr_err("\n"); + pr_err("Error: Can't preemptively bury live object\n"); cachefiles_printk_object(object, NULL); } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - printk(KERN_ERR "CacheFiles: Error:" - " Object already preemptively buried\n"); + pr_err("Error: Object already preemptively buried\n"); } write_unlock(&cache->active_lock); @@ -160,7 +157,7 @@ try_again: write_lock(&cache->active_lock); if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { - printk(KERN_ERR "CacheFiles: Error: Object already active\n"); + pr_err("Error: Object already active\n"); cachefiles_printk_object(object, NULL); BUG(); } @@ -193,9 +190,8 @@ try_again: * need to wait for it to be destroyed */ wait_for_old_object: if (fscache_object_is_live(&object->fscache)) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error:" - " Unexpected object collision\n"); + pr_err("\n"); + pr_err("Error: Unexpected object collision\n"); cachefiles_printk_object(object, xobject); BUG(); } @@ -241,9 +237,8 @@ wait_for_old_object: } if (timeout <= 0) { - printk(KERN_ERR "\n"); - printk(KERN_ERR "CacheFiles: Error: Overlong" - " wait for old active object to go away\n"); + pr_err("\n"); + pr_err("Error: Overlong wait for old active object to go away\n"); cachefiles_printk_object(object, xobject); goto requeue; } @@ -548,7 +543,7 @@ lookup_again: next, next->d_inode, next->d_inode->i_ino); } else if (!S_ISDIR(next->d_inode->i_mode)) { - kerror("inode %lu is not a directory", + pr_err("inode %lu is not a directory", next->d_inode->i_ino); ret = -ENOBUFS; goto error; @@ -579,7 +574,7 @@ lookup_again: } else if (!S_ISDIR(next->d_inode->i_mode) && !S_ISREG(next->d_inode->i_mode) ) { - kerror("inode %lu is not a file or directory", + pr_err("inode %lu is not a file or directory", next->d_inode->i_ino); ret = -ENOBUFS; goto error; @@ -773,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, ASSERT(subdir->d_inode); if (!S_ISDIR(subdir->d_inode->i_mode)) { - kerror("%s is not a directory", dirname); + pr_err("%s is not a directory", dirname); ret = -EIO; goto check_error; } @@ -800,13 +795,13 @@ check_error: mkdir_error: mutex_unlock(&dir->d_inode->i_mutex); dput(subdir); - kerror("mkdir %s failed with error %d", dirname, ret); + pr_err("mkdir %s failed with error %d", dirname, ret); return ERR_PTR(ret); lookup_error: mutex_unlock(&dir->d_inode->i_mutex); ret = PTR_ERR(subdir); - kerror("Lookup %s failed with error %d", dirname, ret); + pr_err("Lookup %s failed with error %d", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: @@ -896,7 +891,7 @@ lookup_error: if (ret == -EIO) { cachefiles_io_error(cache, "Lookup failed"); } else if (ret != -ENOMEM) { - kerror("Internal error: %d", ret); + pr_err("Internal error: %d", ret); ret = -EIO; } @@ -955,7 +950,7 @@ error: } if (ret != -ENOMEM) { - kerror("Internal error: %d", ret); + pr_err("Internal error: %d", ret); ret = -EIO; } diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index 039b5011d83b..396c18ea2764 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -34,9 +34,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache) ret = set_security_override_from_ctx(new, cache->secctx); if (ret < 0) { put_cred(new); - printk(KERN_ERR "CacheFiles:" - " Security denies permission to nominate" - " security context: error %d\n", + pr_err("Security denies permission to nominate security context: error %d\n", ret); goto error; } @@ -59,16 +57,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, ret = security_inode_mkdir(root->d_inode, root, 0); if (ret < 0) { - printk(KERN_ERR "CacheFiles:" - " Security denies permission to make dirs: error %d", + pr_err("Security denies permission to make dirs: error %d", ret); return ret; } ret = security_inode_create(root->d_inode, root, 0); if (ret < 0) - printk(KERN_ERR "CacheFiles:" - " Security denies permission to create files: error %d", + pr_err("Security denies permission to create files: error %d", ret); return ret; diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 12b0eef84183..1ad51ffbb275 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) } if (ret != -EEXIST) { - kerror("Can't set xattr on %*.*s [%lu] (err %d)", + pr_err("Can't set xattr on %*.*s [%lu] (err %d)", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, -ret); @@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) if (ret == -ERANGE) goto bad_type_length; - kerror("Can't read xattr on %*.*s [%lu] (err %d)", + pr_err("Can't read xattr on %*.*s [%lu] (err %d)", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, -ret); @@ -85,14 +85,14 @@ error: return ret; bad_type_length: - kerror("Cache object %lu type xattr length incorrect", + pr_err("Cache object %lu type xattr length incorrect", dentry->d_inode->i_ino); ret = -EIO; goto error; bad_type: xtype[2] = 0; - kerror("Cache object %*.*s [%lu] type %s not %s", + pr_err("Cache object %*.*s [%lu] type %s not %s", dentry->d_name.len, dentry->d_name.len, dentry->d_name.name, dentry->d_inode->i_ino, xtype, type); @@ -293,7 +293,7 @@ error: return ret; bad_type_length: - kerror("Cache object %lu xattr length incorrect", + pr_err("Cache object %lu xattr length incorrect", dentry->d_inode->i_ino); ret = -EIO; goto error; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 21887d63dad5..469f2e8657e8 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -104,12 +104,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; struct dentry *dentry; - if (acl) { - ret = posix_acl_valid(acl); - if (ret < 0) - goto out; - } - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b53278c9fd97..90b3954d48ed 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -211,18 +211,15 @@ static int readpage_nounlock(struct file *filp, struct page *page) SetPageError(page); ceph_fscache_readpage_cancel(inode, page); goto out; - } else { - if (err < PAGE_CACHE_SIZE) { - /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); - } else { - flush_dcache_page(page); - } } - SetPageUptodate(page); + if (err < PAGE_CACHE_SIZE) + /* zero fill remainder of page */ + zero_user_segment(page, err, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); - if (err >= 0) - ceph_readpage_to_fscache(inode, page); + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); out: return err < 0 ? err : 0; @@ -694,7 +691,7 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { - pr_warning("writepage_start %p on forced umount\n", inode); + pr_warn("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ } if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) @@ -1187,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * never get called. */ static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t pos, unsigned long nr_segs) + struct iov_iter *iter, + loff_t pos) { WARN_ON(1); return -EINVAL; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2e5e648eb5c3..1fde164b74b5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -221,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc, return 0; } -static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { struct ceph_cap *cap = NULL; @@ -508,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ -int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned seq, unsigned mseq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation) +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; @@ -531,20 +530,10 @@ int ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); -retry: - spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { - if (new_cap) { - cap = new_cap; - new_cap = NULL; - } else { - spin_unlock(&ci->i_ceph_lock); - new_cap = get_cap(mdsc, caps_reservation); - if (new_cap == NULL) - return -ENOMEM; - goto retry; - } + cap = *new_cap; + *new_cap = NULL; cap->issued = 0; cap->implemented = 0; @@ -562,9 +551,6 @@ retry: session->s_nr_caps++; spin_unlock(&session->s_cap_lock); } else { - if (new_cap) - ceph_put_cap(mdsc, new_cap); - /* * auth mds of the inode changed. we received the cap export * message, but still haven't received the cap import message. @@ -626,7 +612,6 @@ retry: ci->i_auth_cap = cap; cap->mds_wanted = wanted; } - ci->i_cap_exporting_issued = 0; } else { WARN_ON(ci->i_auth_cap == cap); } @@ -648,9 +633,6 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); - spin_unlock(&ci->i_ceph_lock); - wake_up_all(&ci->i_cap_wq); - return 0; } /* @@ -685,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap) */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { - int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; @@ -900,7 +882,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; + return !RB_EMPTY_ROOT(&ci->i_caps); } int ceph_is_any_caps(struct inode *inode) @@ -2397,32 +2379,30 @@ static void invalidate_aliases(struct inode *inode) * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. - * - * return value: - * 0 - ok - * 1 - check_caps on auth cap only (writeback) - * 2 - check_caps (ack revoke) */ -static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, +static void handle_cap_grant(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *grant, + void *snaptrace, int snaptrace_len, + struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, - struct ceph_buffer *xattr_buf) - __releases(ci->i_ceph_lock) + struct ceph_cap *cap, int issued) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); - int issued, implemented, used, wanted, dirty; + int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; int check_caps = 0; - int wake = 0; - int writeback = 0; - int queue_invalidate = 0; - int deleted_inode = 0; - int queue_revalidate = 0; + bool wake = 0; + bool writeback = 0; + bool queue_trunc = 0; + bool queue_invalidate = 0; + bool queue_revalidate = 0; + bool deleted_inode = 0; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); @@ -2466,16 +2446,13 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } /* side effects now are allowed */ - - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); - cap->cap_gen = session->s_cap_gen; cap->seq = seq; __check_cap_issue(ci, cap, newcaps); - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); @@ -2484,7 +2461,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0 && (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) @@ -2511,30 +2489,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) queue_revalidate = 1; - /* size/ctime/mtime/atime? */ - ceph_fill_file_size(inode, issued, - le32_to_cpu(grant->truncate_seq), - le64_to_cpu(grant->truncate_size), size); - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, - &atime); - - - /* file layout may have changed */ - ci->i_layout = grant->layout; - - /* max size increase? */ - if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); - ci->i_max_size = max_size; - if (max_size >= ci->i_wanted_max_size) { - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; + if (newcaps & CEPH_CAP_ANY_RD) { + /* ctime/mtime/atime? */ + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + /* max size increase? */ + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; } - wake = 1; } /* check cap bits */ @@ -2595,6 +2578,23 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, spin_unlock(&ci->i_ceph_lock); + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, false); + downgrade_write(&mdsc->snap_rwsem); + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + if (newcaps & ~issued) + wake = 1; + } + + if (queue_trunc) { + ceph_queue_vmtruncate(inode); + ceph_queue_revalidate(inode); + } else if (queue_revalidate) + ceph_queue_revalidate(inode); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2606,8 +2606,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_invalidate(inode); if (deleted_inode) invalidate_aliases(inode); - if (queue_revalidate) - ceph_queue_revalidate(inode); if (wake) wake_up_all(&ci->i_cap_wq); @@ -2784,7 +2782,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *tsession = NULL; - struct ceph_cap *cap, *tcap; + struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); @@ -2807,7 +2805,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, retry: spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); - if (!cap) + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) goto out_unlock; if (target < 0) { @@ -2846,15 +2844,14 @@ retry: } __ceph_remove_cap(cap, false); goto out_unlock; - } - - if (tsession) { - int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; - spin_unlock(&ci->i_ceph_lock); + } else if (tsession) { /* add placeholder for the export tagert */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, - t_seq - 1, t_mseq, (u64)-1, flag, NULL); - goto retry; + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + __ceph_remove_cap(cap, false); + goto out_unlock; } spin_unlock(&ci->i_ceph_lock); @@ -2873,6 +2870,7 @@ retry: SINGLE_DEPTH_NESTING); } ceph_add_cap_releases(mdsc, tsession); + new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); tsession = NULL; @@ -2887,24 +2885,27 @@ out_unlock: mutex_unlock(&tsession->s_mutex); ceph_put_mds_session(tsession); } + if (new_cap) + ceph_put_cap(mdsc, new_cap); } /* - * Handle cap IMPORT. If there are temp bits from an older EXPORT, - * clean them up. + * Handle cap IMPORT. * - * caller holds s_mutex. + * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, - void *snaptrace, int snaptrace_len) + struct ceph_cap **target_cap, int *old_issued) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *cap; + struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; - unsigned issued = le32_to_cpu(im->caps); + int issued; + unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); @@ -2924,40 +2925,52 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", inode, ci, mds, mseq, peer); +retry: spin_lock(&ci->i_ceph_lock); - cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; - if (cap && cap->cap_id == p_cap_id) { + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + goto retry; + } + cap = new_cap; + } else { + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } + } + + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { dout(" remove export cap %p mds%d flags %d\n", - cap, peer, ph->flags); + ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && - (cap->seq != le32_to_cpu(ph->seq) || - cap->mseq != le32_to_cpu(ph->mseq))) { + (ocap->seq != le32_to_cpu(ph->seq) || + ocap->mseq != le32_to_cpu(ph->mseq))) { pr_err("handle_cap_import: mismatched seq/mseq: " "ino (%llx.%llx) mds%d seq %d mseq %d " "importer mds%d has peer seq %d mseq %d\n", - ceph_vinop(inode), peer, cap->seq, - cap->mseq, mds, le32_to_cpu(ph->seq), + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), le32_to_cpu(ph->mseq)); } - ci->i_cap_exporting_issued = cap->issued; - __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } /* make sure we re-request max_size, if necessary */ ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); - - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, - false); - downgrade_write(&mdsc->snap_rwsem); - ceph_add_cap(inode, session, cap_id, -1, - issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, - NULL /* no caps context */); - kick_flushing_inode_caps(mdsc, session, inode); - up_read(&mdsc->snap_rwsem); + *old_issued = issued; + *target_cap = cap; } /* @@ -2977,7 +2990,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; - int op; + int op, issued; u32 seq, mseq; struct ceph_vino vino; u64 cap_id; @@ -3069,7 +3082,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_IMPORT: handle_cap_import(mdsc, inode, h, peer, session, - snaptrace, snaptrace_len); + &cap, &issued); + handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + msg->middle, session, cap, issued); + goto done_unlocked; } /* the rest require a cap */ @@ -3086,8 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - case CEPH_CAP_OP_IMPORT: - handle_cap_grant(inode, h, session, cap, msg->middle); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, + session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -3261,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 16b54aa31f08..5a743ac141ab 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -71,9 +71,9 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); if (req->r_got_unsafe) - seq_printf(s, "\t(unsafe)"); + seq_puts(s, "\t(unsafe)"); else - seq_printf(s, "\t"); + seq_puts(s, "\t"); if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); @@ -119,7 +119,7 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, " %s", req->r_path2); } - seq_printf(s, "\n"); + seq_puts(s, "\n"); } mutex_unlock(&mdsc->mutex); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 766410a12c2c..c29d6ae68874 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,7 +141,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, /* start at beginning? */ if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -182,9 +182,16 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -198,19 +205,12 @@ more: return 0; } + ctx->pos = di->offset + 1; + if (last) dput(last); last = dentry; - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } - spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; @@ -296,6 +296,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -446,7 +448,6 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); @@ -935,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 00d6af6a32ec..8d7d782f4382 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -169,7 +169,7 @@ static struct dentry *__get_parent(struct super_block *sb, return dentry; } -struct dentry *ceph_get_parent(struct dentry *child) +static struct dentry *ceph_get_parent(struct dentry *child) { /* don't re-export snaps */ if (ceph_snap(child->d_inode) != CEPH_NOSNAP) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 39da1c2efa50..302085100c28 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -418,7 +418,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, struct page **pages; u64 off = iocb->ki_pos; int num_pages, ret; - size_t len = i->count; + size_t len = iov_iter_count(i); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, @@ -436,25 +436,26 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (file->f_flags & O_DIRECT) { while (iov_iter_count(i)) { - void __user *data = i->iov[0].iov_base + i->iov_offset; - size_t len = i->iov[0].iov_len - i->iov_offset; + size_t start; + ssize_t n; - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, - num_pages, true); - if (IS_ERR(pages)) - return PTR_ERR(pages); + n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); + if (n < 0) + return n; - ret = striped_read(inode, off, len, + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; + + ret = striped_read(inode, off, n, pages, num_pages, checkeof, - 1, (unsigned long)data & ~PAGE_MASK); + 1, start); + ceph_put_page_vector(pages, num_pages, true); if (ret <= 0) break; off += ret; iov_iter_advance(i, ret); - if (ret < len) + if (ret < n) break; } } else { @@ -466,25 +467,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, num_pages, checkeof, 0, 0); if (ret > 0) { int l, k = 0; - size_t left = len = ret; + size_t left = ret; while (left) { - void __user *data = i->iov[0].iov_base - + i->iov_offset; - l = min(i->iov[0].iov_len - i->iov_offset, - left); - - ret = ceph_copy_page_vector_to_user(&pages[k], - data, off, - l); - if (ret > 0) { - iov_iter_advance(i, ret); - left -= ret; - off += ret; - k = calc_pages_for(iocb->ki_pos, - len - left + 1) - 1; - BUG_ON(k >= num_pages && left); - } else + int copy = min_t(size_t, PAGE_SIZE, left); + l = copy_page_to_iter(pages[k++], 0, copy, i); + off += l; + left -= l; + if (l < copy) break; } } @@ -541,8 +531,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, size_t count) +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -556,11 +545,10 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, int written = 0; int flags; int check_caps = 0; - int page_align; int ret; struct timespec mtime = CURRENT_TIME; loff_t pos = iocb->ki_pos; - struct iov_iter i; + size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -582,13 +570,10 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - iov_iter_init(&i, iov, nr_segs, count, 0); - - while (iov_iter_count(&i) > 0) { - void __user *data = i.iov->iov_base + i.iov_offset; - u64 len = i.iov->iov_len - i.iov_offset; - - page_align = (unsigned long)data & ~PAGE_MASK; + while (iov_iter_count(from) > 0) { + u64 len = iov_iter_single_seg_count(from); + size_t start; + ssize_t n; snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); @@ -604,20 +589,21 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, break; } - num_pages = calc_pages_for(page_align, len); - pages = ceph_get_direct_page_vector(data, num_pages, false); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - goto out; + n = iov_iter_get_pages_alloc(from, &pages, len, &start); + if (unlikely(n < 0)) { + ret = n; + ceph_osdc_put_request(req); + break; } + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; /* * throw out any page cache pages in this range. this * may block. */ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + (pos+n) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, false, false); /* BUG_ON(vino.snap != CEPH_NOSNAP); */ @@ -629,22 +615,20 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, ceph_put_page_vector(pages, num_pages, false); -out: ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - iov_iter_advance(&i, (size_t)len); - - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); - } - } else + if (ret) break; + pos += n; + written += n; + iov_iter_advance(from, n); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } } if (ret != -EOLDSNAPC && written > 0) { @@ -662,8 +646,7 @@ out: * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, size_t count) +static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -681,7 +664,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, int ret; struct timespec mtime = CURRENT_TIME; loff_t pos = iocb->ki_pos; - struct iov_iter i; + size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -703,9 +686,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; - iov_iter_init(&i, iov, nr_segs, count, 0); - - while ((len = iov_iter_count(&i)) > 0) { + while ((len = iov_iter_count(from)) > 0) { size_t left; int n; @@ -737,13 +718,12 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, left = len; for (n = 0; n < num_pages; n++) { size_t plen = min_t(size_t, left, PAGE_SIZE); - ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); + ret = copy_page_from_iter(pages[n], 0, plen, from); if (ret != plen) { ret = -EFAULT; break; } left -= ret; - iov_iter_advance(&i, ret); } if (ret < 0) { @@ -796,8 +776,7 @@ out: * * Hmm, the sync read case isn't actually async... should it be? */ -static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; @@ -823,40 +802,20 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { - struct iov_iter i; dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - if (!read) { - ret = generic_segment_checks(iov, &nr_segs, - &len, VERIFY_WRITE); - if (ret) - goto out; - } - - iov_iter_init(&i, iov, nr_segs, len, read); - /* hmm, this isn't really async... */ - ret = ceph_sync_read(iocb, &i, &checkeof); + ret = ceph_sync_read(iocb, to, &checkeof); } else { - /* - * We can't modify the content of iov, - * so we only read from beginning. - */ - if (read) { - iocb->ki_pos = pos; - len = iocb->ki_nbytes; - read = 0; - } dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = generic_file_read_iter(iocb, to); } -out: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); @@ -872,6 +831,7 @@ out: ", reading more\n", iocb->ki_pos, inode->i_size); + iov_iter_advance(to, ret); read += ret; len -= ret; checkeof = 0; @@ -895,8 +855,7 @@ out: * * If we are near ENOSPC, write synchronously. */ -static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct ceph_file_info *fi = file->private_data; @@ -904,18 +863,15 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; - ssize_t count, written = 0; + ssize_t count = iov_iter_count(from), written = 0; int err, want, got; + loff_t pos = iocb->ki_pos; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (err) - goto out; - /* We can write back this queue in page reclaim */ current->backing_dev_info = file->f_mapping->backing_dev_info; @@ -925,6 +881,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; + iov_iter_truncate(from, count); err = file_remove_suid(file); if (err) @@ -956,23 +913,26 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct iov_iter data; mutex_unlock(&inode->i_mutex); + /* we might need to revert back to that point */ + data = *from; if (file->f_flags & O_DIRECT) - written = ceph_sync_direct_write(iocb, iov, - nr_segs, count); + written = ceph_sync_direct_write(iocb, &data); else - written = ceph_sync_write(iocb, iov, nr_segs, count); + written = ceph_sync_write(iocb, &data); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), - pos, (unsigned)iov->iov_len); + pos, (unsigned)count); mutex_lock(&inode->i_mutex); goto retry_snap; } + if (written > 0) + iov_iter_advance(from, written); } else { loff_t old_size = inode->i_size; - struct iov_iter from; /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -980,8 +940,7 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - iov_iter_init(&from, iov, nr_segs, count, 0); - written = generic_perform_write(file, &from, pos); + written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; if (inode->i_size > old_size) @@ -999,7 +958,7 @@ retry_snap: } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode, ceph_vinop(inode), pos, (unsigned)count, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); @@ -1221,9 +1180,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -1279,16 +1235,16 @@ const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, .llseek = ceph_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = ceph_aio_read, - .aio_write = ceph_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = ceph_read_iter, + .write_iter = ceph_write_iter, .mmap = ceph_mmap, .fsync = ceph_fsync, .lock = ceph_lock, .flock = ceph_flock, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, .fallocate = ceph_fallocate, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0b0728e5be2d..04c89c266cec 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -10,6 +10,7 @@ #include <linux/writeback.h> #include <linux/vmalloc.h> #include <linux/posix_acl.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" @@ -179,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) * specified, copy the frag delegation info to the caller if * it is present. */ -u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, - struct ceph_inode_frag *pfrag, - int *found) +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) { u32 t = ceph_frag_make(0, 0); struct ceph_inode_frag *frag; @@ -191,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, if (found) *found = 0; - mutex_lock(&ci->i_fragtree_mutex); while (1) { WARN_ON(!ceph_frag_contains_value(t, v)); frag = __ceph_find_frag(ci, t); @@ -220,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, } dout("choose_frag(%x) = %x\n", v, t); - mutex_unlock(&ci->i_fragtree_mutex); return t; } +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 ret; + mutex_lock(&ci->i_fragtree_mutex); + ret = __ceph_choose_frag(ci, v, pfrag, found); + mutex_unlock(&ci->i_fragtree_mutex); + return ret; +} + /* * Process dirfrag (delegation) info from the mds. Include leaf * fragment in tree ONLY if ndist > 0. Otherwise, only @@ -237,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode, u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); int ndist = le32_to_cpu(dirinfo->ndist); + int diri_auth = -1; int i; int err = 0; + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + diri_auth = ci->i_auth_cap->mds; + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&ci->i_fragtree_mutex); - if (ndist == 0) { + if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ frag = __ceph_find_frag(ci, id); if (!frag) @@ -286,6 +300,75 @@ out: return err; } +static int ceph_fill_fragtree(struct inode *inode, + struct ceph_frag_tree_head *fragtree, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *rb_node; + int i; + u32 id, nsplits; + bool update = false; + + mutex_lock(&ci->i_fragtree_mutex); + nsplits = le32_to_cpu(fragtree->nsplits); + if (nsplits) { + i = prandom_u32() % nsplits; + id = le32_to_cpu(fragtree->splits[i].frag); + if (!__ceph_find_frag(ci, id)) + update = true; + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { + rb_node = rb_first(&ci->i_fragtree); + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) + update = true; + } + if (!update && dirinfo) { + id = le32_to_cpu(dirinfo->frag); + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) + update = true; + } + if (!update) + goto out_unlock; + + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + rb_node = rb_first(&ci->i_fragtree); + for (i = 0; i < nsplits; i++) { + id = le32_to_cpu(fragtree->splits[i].frag); + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } + frag->split_by = le32_to_cpu(fragtree->splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } +out_unlock: + mutex_unlock(&ci->i_fragtree_mutex); + return 0; +} /* * initialize a newly allocated inode. @@ -341,7 +424,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; - ci->i_cap_exporting_issued = 0; for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; @@ -407,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode) /* * we may still have a snap_realm reference if there are stray - * caps in i_cap_exporting_issued or i_snap_caps. + * caps in i_snap_caps. */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = @@ -582,22 +664,26 @@ static int fill_inode(struct inode *inode, unsigned long ttl_from, int cap_fmode, struct ceph_cap_reservation *caps_reservation) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); - int i; - int issued = 0, implemented; + int issued = 0, implemented, new_issued; struct timespec mtime, atime, ctime; - u32 nsplits; - struct ceph_inode_frag *frag; - struct rb_node *rb_node; struct ceph_buffer *xattr_blob = NULL; + struct ceph_cap *new_cap = NULL; int err = 0; - int queue_trunc = 0; + bool wake = false; + bool queue_trunc = false; + bool new_version = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + /* prealloc new cap struct */ + if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) + new_cap = ceph_get_cap(mdsc, caps_reservation); + /* * prealloc xattr data, if it looks like we'll need it. only * if len > 4 (meaning there are actually xattrs; the first 4 @@ -623,19 +709,23 @@ static int fill_inode(struct inode *inode, * 3 2 skip * 3 3 update */ - if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) >= le64_to_cpu(info->version)) - goto no_change; - + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; + issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); + new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ ci->i_version = le64_to_cpu(info->version); inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); @@ -644,23 +734,35 @@ static int fill_inode(struct inode *inode, from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); - /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); - queue_trunc = ceph_fill_file_size(inode, issued, - le32_to_cpu(info->truncate_seq), - le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); - ceph_fill_file_time(inode, issued, - le32_to_cpu(info->time_warp_seq), - &ctime, &mtime, &atime); - - ci->i_layout = info->layout; - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ @@ -744,60 +846,7 @@ static int fill_inode(struct inode *inode, !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; - } -no_change: - /* only update max_size on auth cap */ - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && - ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); - ci->i_max_size = le64_to_cpu(info->max_size); - } - - spin_unlock(&ci->i_ceph_lock); - - /* queue truncate if we saw i_size decrease */ - if (queue_trunc) - ceph_queue_vmtruncate(inode); - - /* populate frag tree */ - /* FIXME: move me up, if/when version reflects fragtree changes */ - nsplits = le32_to_cpu(info->fragtree.nsplits); - mutex_lock(&ci->i_fragtree_mutex); - rb_node = rb_first(&ci->i_fragtree); - for (i = 0; i < nsplits; i++) { - u32 id = le32_to_cpu(info->fragtree.splits[i].frag); - frag = NULL; - while (rb_node) { - frag = rb_entry(rb_node, struct ceph_inode_frag, node); - if (ceph_frag_compare(frag->frag, id) >= 0) { - if (frag->frag != id) - frag = NULL; - else - rb_node = rb_next(rb_node); - break; - } - rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); - frag = NULL; - } - if (!frag) { - frag = __get_or_create_frag(ci, id); - if (IS_ERR(frag)) - continue; - } - frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); - dout(" frag %x split by %d\n", frag->frag, frag->split_by); } - while (rb_node) { - frag = rb_entry(rb_node, struct ceph_inode_frag, node); - rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); - } - mutex_unlock(&ci->i_fragtree_mutex); /* were we issued a capability? */ if (info->cap.caps) { @@ -810,30 +859,41 @@ no_change: le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), - info->cap.flags, - caps_reservation); + info->cap.flags, &new_cap); + wake = true; } else { - spin_lock(&ci->i_ceph_lock); dout(" %p got snap_caps %s\n", inode, ceph_cap_string(le32_to_cpu(info->cap.caps))); ci->i_snap_caps |= le32_to_cpu(info->cap.caps); if (cap_fmode >= 0) __ceph_get_fmode(ci, cap_fmode); - spin_unlock(&ci->i_ceph_lock); } } else if (cap_fmode >= 0) { - pr_warning("mds issued no caps on %llx.%llx\n", + pr_warn("mds issued no caps on %llx.%llx\n", ceph_vinop(inode)); __ceph_get_fmode(ci, cap_fmode); } + spin_unlock(&ci->i_ceph_lock); + + if (wake) + wake_up_all(&ci->i_cap_wq); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + if (S_ISDIR(inode->i_mode)) + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); err = 0; - out: + if (new_cap) + ceph_put_cap(mdsc, new_cap); if (xattr_blob) ceph_buffer_put(xattr_blob); return err; @@ -890,41 +950,6 @@ out_unlock: } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -933,7 +958,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -965,8 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -987,7 +1010,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; @@ -1161,6 +1183,9 @@ retry_lookup: /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1180,13 +1205,10 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1213,8 +1235,9 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease, true); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1235,17 +1258,16 @@ retry_lookup: (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, NULL, true); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1407,7 +1429,7 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL, false); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); dn = NULL; @@ -1524,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(inode->i_mapping, 0); + truncate_pagecache(inode, 0); spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1627,7 +1649,7 @@ retry: ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(inode->i_mapping, to); + truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); if (to == ci->i_truncate_size) { diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index fdf941b44ff1..a822a6e58290 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -109,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -153,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d94ba0df9f4d..fbc39c47bacd 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -45,6 +45,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) @@ -52,10 +53,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, else length = fl->fl_end - fl->fl_start + 1; - if (lock_type == CEPH_LOCK_FCNTL) - owner = secure_addr(fl->fl_owner); - else - owner = secure_addr(fl->fl_file); + owner = secure_addr(fl->fl_owner); dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, @@ -313,10 +311,7 @@ int lock_to_ceph_filelock(struct file_lock *lock, cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); cephlock->pid = cpu_to_le64((u64)lock->fl_pid); - if (lock->fl_flags & FL_POSIX) - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); - else - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file)); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); switch (lock->fl_type) { case F_RDLCK: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 2b4d093d0563..92a2548278fc 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1558,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + req->r_stamp = CURRENT_TIME; + req->r_op = op; req->r_direct_mode = mode; return req; @@ -1783,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + sizeof(struct timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -1800,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } + msg->hdr.version = 2; msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; @@ -1836,6 +1840,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); head->num_releases = cpu_to_le16(releases); + /* time stamp */ + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2218,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* dup? */ if ((req->r_got_unsafe && !head->safe) || (req->r_got_safe && head->safe)) { - pr_warning("got a dup %s reply on %llu from mds%d\n", + pr_warn("got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } if (req->r_got_safe && !head->safe) { - pr_warning("got unsafe after safe on %llu from mds%d\n", + pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; @@ -3525,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - pr_warning("mds%d closed our session\n", s->s_mds); + pr_warn("mds%d closed our session\n", s->s_mds); send_mds_reconnect(mdsc, s); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e90cfccf93bd..e00737cf523c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -194,6 +194,7 @@ struct ceph_mds_request { int r_fmode; /* file mode, if expecting cap */ kuid_t r_uid; kgid_t r_gid; + struct timespec r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 132b64eeecd4..261531e55e9d 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -62,7 +62,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_decode_16_safe(p, end, version, bad); if (version > 3) { - pr_warning("got mdsmap version %d > 3, failing", version); + pr_warn("got mdsmap version %d > 3, failing", version); goto bad; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7866cd05a6bb..12b20744e386 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -266,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -293,7 +292,6 @@ struct ceph_inode_info { struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ - unsigned i_cap_exporting_issued; int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ @@ -776,11 +774,13 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode) extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg); -extern int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned cap, unsigned seq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation); +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 0227b45ef00a..15e9505aa35f 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -290,7 +290,8 @@ int cifsConvertToUTF16(__le16 *target, const char *source, int srclen, const struct nls_table *cp, int mapChars) { - int i, j, charlen; + int i, charlen; + int j = 0; char src_char; __le16 dst_char; wchar_t tmp; @@ -298,12 +299,11 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, if (!mapChars) return cifs_strtoUTF16(target, source, PATH_MAX, cp); - for (i = 0, j = 0; i < srclen; j++) { + for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; switch (src_char) { case 0: - put_unaligned(0, &target[j]); goto ctoUTF16_out; case ':': dst_char = cpu_to_le16(UNI_COLON); @@ -350,6 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, } ctoUTF16_out: + put_unaligned(0, &target[j]); /* Null terminate target unicode string */ return j; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index df9c9141c099..888398067420 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -87,10 +87,6 @@ extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; -#ifdef CONFIG_CIFS_SMB2 -__u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; -#endif - /* * Bumps refcount for cifs super block. * Note that it should be only called if a referece to VFS super block is @@ -251,8 +247,9 @@ cifs_alloc_inode(struct super_block *sb) * server, can not assume caching of file data or metadata. */ cifs_set_oplock_level(cifs_inode, 0); - cifs_inode->delete_pending = false; - cifs_inode->invalid_mapping = false; + cifs_inode->flags = 0; + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -297,7 +294,7 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; - seq_printf(s, ",addr="); + seq_puts(s, ",addr="); switch (server->dstaddr.ss_family) { case AF_INET: @@ -309,7 +306,7 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) seq_printf(s, "%%%u", sa6->sin6_scope_id); break; default: - seq_printf(s, "(unknown)"); + seq_puts(s, "(unknown)"); } } @@ -319,45 +316,45 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) if (ses->sectype == Unspecified) return; - seq_printf(s, ",sec="); + seq_puts(s, ",sec="); switch (ses->sectype) { case LANMAN: - seq_printf(s, "lanman"); + seq_puts(s, "lanman"); break; case NTLMv2: - seq_printf(s, "ntlmv2"); + seq_puts(s, "ntlmv2"); break; case NTLM: - seq_printf(s, "ntlm"); + seq_puts(s, "ntlm"); break; case Kerberos: - seq_printf(s, "krb5"); + seq_puts(s, "krb5"); break; case RawNTLMSSP: - seq_printf(s, "ntlmssp"); + seq_puts(s, "ntlmssp"); break; default: /* shouldn't ever happen */ - seq_printf(s, "unknown"); + seq_puts(s, "unknown"); break; } if (ses->sign) - seq_printf(s, "i"); + seq_puts(s, "i"); } static void cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) { - seq_printf(s, ",cache="); + seq_puts(s, ",cache="); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) - seq_printf(s, "strict"); + seq_puts(s, "strict"); else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) - seq_printf(s, "none"); + seq_puts(s, "none"); else - seq_printf(s, "loose"); + seq_puts(s, "loose"); } static void @@ -390,7 +387,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_cache_flavor(s, cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) - seq_printf(s, ",multiuser"); + seq_puts(s, ",multiuser"); else if (tcon->ses->user_name) seq_printf(s, ",username=%s", tcon->ses->user_name); @@ -416,16 +413,16 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",uid=%u", from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) - seq_printf(s, ",forceuid"); + seq_puts(s, ",forceuid"); else - seq_printf(s, ",noforceuid"); + seq_puts(s, ",noforceuid"); seq_printf(s, ",gid=%u", from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid)); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) - seq_printf(s, ",forcegid"); + seq_puts(s, ",forcegid"); else - seq_printf(s, ",noforcegid"); + seq_puts(s, ",noforcegid"); cifs_show_address(s, tcon->ses->server); @@ -437,47 +434,47 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_nls(s, cifs_sb->local_nls); if (tcon->seal) - seq_printf(s, ",seal"); + seq_puts(s, ",seal"); if (tcon->nocase) - seq_printf(s, ",nocase"); + seq_puts(s, ",nocase"); if (tcon->retry) - seq_printf(s, ",hard"); + seq_puts(s, ",hard"); if (tcon->unix_ext) - seq_printf(s, ",unix"); + seq_puts(s, ",unix"); else - seq_printf(s, ",nounix"); + seq_puts(s, ",nounix"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) - seq_printf(s, ",posixpaths"); + seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) - seq_printf(s, ",setuids"); + seq_puts(s, ",setuids"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - seq_printf(s, ",serverino"); + seq_puts(s, ",serverino"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) - seq_printf(s, ",rwpidforward"); + seq_puts(s, ",rwpidforward"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) - seq_printf(s, ",forcemand"); + seq_puts(s, ",forcemand"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) - seq_printf(s, ",nouser_xattr"); + seq_puts(s, ",nouser_xattr"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) - seq_printf(s, ",mapchars"); + seq_puts(s, ",mapchars"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) - seq_printf(s, ",sfu"); + seq_puts(s, ",sfu"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) - seq_printf(s, ",nobrl"); + seq_puts(s, ",nobrl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) - seq_printf(s, ",cifsacl"); + seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) - seq_printf(s, ",dynperm"); + seq_puts(s, ",dynperm"); if (root->d_sb->s_flags & MS_POSIXACL) - seq_printf(s, ",acl"); + seq_puts(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) - seq_printf(s, ",mfsymlinks"); + seq_puts(s, ",mfsymlinks"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) - seq_printf(s, ",fsc"); + seq_puts(s, ",fsc"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) - seq_printf(s, ",nostrictsync"); + seq_puts(s, ",nostrictsync"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) - seq_printf(s, ",noperm"); + seq_puts(s, ",noperm"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) seq_printf(s, ",backupuid=%u", from_kuid_munged(&init_user_ns, @@ -728,23 +725,42 @@ out_nls: goto out; } -static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t +cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + ssize_t rc; + struct inode *inode = file_inode(iocb->ki_filp); + + rc = cifs_revalidate_mapping(inode); + if (rc) + return rc; + + return generic_file_read_iter(iocb, iter); +} + +static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; - written = generic_file_aio_write(iocb, iov, nr_segs, pos); + written = cifs_get_writer(cinode); + if (written) + return written; + + written = generic_file_write_iter(iocb, from); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) - cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", + cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } @@ -876,10 +892,10 @@ const struct inode_operations cifs_symlink_inode_ops = { }; const struct file_operations cifs_file_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = cifs_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_loose_read_iter, + .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -895,10 +911,10 @@ const struct file_operations cifs_file_ops = { }; const struct file_operations cifs_file_strict_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_strict_readv, - .aio_write = cifs_strict_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_strict_readv, + .write_iter = cifs_strict_writev, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -915,10 +931,10 @@ const struct file_operations cifs_file_strict_ops = { const struct file_operations cifs_file_direct_ops = { /* BB reevaluate whether they can be done with directio, no cache */ - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_user_readv, - .aio_write = cifs_user_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_user_readv, + .write_iter = cifs_user_writev, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -934,10 +950,10 @@ const struct file_operations cifs_file_direct_ops = { }; const struct file_operations cifs_file_nobrl_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = cifs_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_loose_read_iter, + .write_iter = cifs_file_write_iter, .open = cifs_open, .release = cifs_close, .fsync = cifs_fsync, @@ -952,10 +968,10 @@ const struct file_operations cifs_file_nobrl_ops = { }; const struct file_operations cifs_file_strict_nobrl_ops = { - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_strict_readv, - .aio_write = cifs_strict_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_strict_readv, + .write_iter = cifs_strict_writev, .open = cifs_open, .release = cifs_close, .fsync = cifs_strict_fsync, @@ -971,10 +987,10 @@ const struct file_operations cifs_file_strict_nobrl_ops = { const struct file_operations cifs_file_direct_nobrl_ops = { /* BB reevaluate whether they can be done with directio, no cache */ - .read = do_sync_read, - .write = do_sync_write, - .aio_read = cifs_user_readv, - .aio_write = cifs_user_writev, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = cifs_user_readv, + .write_iter = cifs_user_writev, .open = cifs_open, .release = cifs_close, .fsync = cifs_fsync, @@ -1180,10 +1196,6 @@ init_cifs(void) spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_SMB2 - get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE); -#endif - if (cifs_max_pending < 2) { cifs_max_pending = 2; cifs_dbg(FYI, "cifs_max_pending set to min of 2\n"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 26a754f49ba1..70f178a7c759 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -22,20 +22,28 @@ #ifndef _CIFSFS_H #define _CIFSFS_H +#include <linux/hash.h> + #define ROOT_I 2 /* * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down - * so that it will fit. + * so that it will fit. We use hash_64 to convert the value to 31 bits, and + * then add 1, to ensure that we don't end up with a 0 as the value. */ +#if BITS_PER_LONG == 64 static inline ino_t cifs_uniqueid_to_ino_t(u64 fileid) { - ino_t ino = (ino_t) fileid; - if (sizeof(ino_t) < sizeof(u64)) - ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; - return ino; + return (ino_t)fileid; } +#else +static inline ino_t +cifs_uniqueid_to_ino_t(u64 fileid) +{ + return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; +} +#endif extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; @@ -67,6 +75,8 @@ extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); +extern int cifs_revalidate_mapping(struct inode *inode); +extern int cifs_zap_mapping(struct inode *inode); extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int cifs_setattr(struct dentry *, struct iattr *); @@ -85,14 +95,10 @@ extern const struct file_operations cifs_file_strict_nobrl_ops; extern int cifs_open(struct inode *inode, struct file *file); extern int cifs_close(struct inode *inode, struct file *file); extern int cifs_closedir(struct inode *inode, struct file *file); -extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); +extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); +extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); @@ -130,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.02" +#define CIFS_VERSION "2.03" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0f3718b77a8..de6aed8c78e5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -557,6 +559,7 @@ struct TCP_Server_Info { int echo_credits; /* echo reserved slots */ int oplock_credits; /* oplock break reserved slots */ bool echoes:1; /* enable echoes */ + __u8 client_guid[SMB2_CLIENT_GUID_SIZE]; /* Client GUID */ #endif u16 dialect; /* dialect index that server chose */ bool oplocks:1; /* enable oplocks */ @@ -1111,8 +1114,15 @@ struct cifsInodeInfo { __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ unsigned int oplock; /* oplock/lease level we have */ unsigned int epoch; /* used to track lease state changes */ - bool delete_pending; /* DELETE_ON_CLOSE is set */ - bool invalid_mapping; /* pagecache is invalid */ +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ +#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ +#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ +#define CIFS_INO_LOCK (5) /* lock bit for synchronization */ + unsigned long flags; + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index acc4ee8ed075..ca7980a1e303 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f3264bd7a83d..6ce4e0954b98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6197,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8813ff776ba3..20d75b8ddb26 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2144,6 +2144,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info) sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); +#ifdef CONFIG_CIFS_SMB2 + get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); +#endif /* * at this point we are the only ones with the pointer * to the struct since the kernel thread not created yet @@ -2225,7 +2228,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) vol->username ? vol->username : "", CIFS_MAX_USERNAME_LEN)) return 0; - if (strlen(vol->username) != 0 && + if ((vol->username && strlen(vol->username) != 0) && ses->password != NULL && strncmp(ses->password, vol->password ? vol->password : "", diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8add25538a3b..e90a1e9aa627 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -335,7 +335,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, spin_unlock(&cifs_file_list_lock); if (fid->purge_cache) - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); file->private_data = cfile; return cfile; @@ -392,7 +392,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) * again and get at least level II oplock. */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) - CIFS_I(inode)->invalid_mapping = true; + set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -1529,7 +1529,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, */ if (!CIFS_CACHE_WRITE(CIFS_I(inode)) && CIFS_CACHE_READ(CIFS_I(inode))) { - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n", inode); CIFS_I(inode)->oplock = 0; @@ -2218,7 +2218,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, file->f_path.dentry->d_name.name, datasync); if (!CIFS_CACHE_READ(CIFS_I(inode))) { - rc = cifs_invalidate_mapping(inode); + rc = cifs_zap_mapping(inode); if (rc) { cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); rc = 0; /* don't care about it in fsync */ @@ -2385,14 +2385,12 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata) } static ssize_t -cifs_iovec_write(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *poffset) +cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) { unsigned long nr_pages, i; size_t bytes, copied, len, cur_len; ssize_t total_written = 0; loff_t offset; - struct iov_iter it; struct cifsFileInfo *open_file; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; @@ -2401,14 +2399,16 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, int rc; pid_t pid; - len = iov_length(iov, nr_segs); - if (!len) - return 0; - + len = iov_iter_count(from); rc = generic_write_checks(file, poffset, &len, 0); if (rc) return rc; + if (!len) + return 0; + + iov_iter_truncate(from, len); + INIT_LIST_HEAD(&wdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; @@ -2424,7 +2424,6 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, else pid = current->tgid; - iov_iter_init(&it, iov, nr_segs, len, 0); do { size_t save_len; @@ -2444,11 +2443,10 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, save_len = cur_len; for (i = 0; i < nr_pages; i++) { - bytes = min_t(const size_t, cur_len, PAGE_SIZE); - copied = iov_iter_copy_from_user(wdata->pages[i], &it, - 0, bytes); + bytes = min_t(size_t, cur_len, PAGE_SIZE); + copied = copy_page_from_iter(wdata->pages[i], 0, bytes, + from); cur_len -= copied; - iov_iter_advance(&it, copied); /* * If we didn't copy as much as we expected, then that * may mean we trod into an unmapped area. Stop copying @@ -2546,11 +2544,11 @@ restart_loop: return total_written ? total_written : (ssize_t)rc; } -ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) { ssize_t written; struct inode *inode; + loff_t pos = iocb->ki_pos; inode = file_inode(iocb->ki_filp); @@ -2560,9 +2558,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, * write request. */ - written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos); + written = cifs_iovec_write(iocb->ki_filp, from, &pos); if (written > 0) { - CIFS_I(inode)->invalid_mapping = true; + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags); iocb->ki_pos = pos; } @@ -2570,8 +2568,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -cifs_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_writev(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2589,17 +2586,17 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, mutex_lock(&inode->i_mutex); if (file->f_flags & O_APPEND) lock_pos = i_size_read(inode); - if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), + if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from), server->vals->exclusive_lock_type, NULL, CIFS_WRITE_OP)) { - rc = __generic_file_aio_write(iocb, iov, nr_segs); + rc = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (rc > 0) { ssize_t err; err = generic_write_sync(file, iocb->ki_pos - rc, rc); - if (rc < 0) + if (err < 0) rc = err; } } else { @@ -2610,8 +2607,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, } ssize_t -cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); @@ -2621,12 +2617,19 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_write_iter(iocb, from); + goto out; + } + written = cifs_writev(iocb, from); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2634,18 +2637,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, * affected pages because it may cause a error with mandatory locks on * these pages but not on the region from pos to ppos+len-1. */ - written = cifs_user_writev(iocb, iov, nr_segs, pos); + written = cifs_user_writev(iocb, from); if (written > 0 && CIFS_CACHE_READ(cinode)) { /* * Windows 7 server can delay breaking level2 oplock if a write * request comes - break it on the client to prevent reading * an old data. */ - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -2821,32 +2826,25 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, return total_read > 0 ? total_read : result; } -ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; ssize_t rc; size_t len, cur_len; ssize_t total_read = 0; - loff_t offset = pos; + loff_t offset = iocb->ki_pos; unsigned int npages; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; struct cifsFileInfo *open_file; struct cifs_readdata *rdata, *tmp; struct list_head rdata_list; - struct iov_iter to; pid_t pid; - if (!nr_segs) - return 0; - - len = iov_length(iov, nr_segs); + len = iov_iter_count(to); if (!len) return 0; - iov_iter_init(&to, iov, nr_segs, len, 0); - INIT_LIST_HEAD(&rdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; @@ -2872,7 +2870,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); @@ -2904,7 +2902,7 @@ error: if (!list_empty(&rdata_list)) rc = 0; - len = iov_iter_count(&to); + len = iov_iter_count(to); /* the loop below should proceed in the order of increasing offsets */ list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { again: @@ -2921,7 +2919,7 @@ error: goto again; } } else { - rc = cifs_readdata_to_iov(rdata, &to); + rc = cifs_readdata_to_iov(rdata, to); } } @@ -2929,7 +2927,7 @@ error: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - total_read = len - iov_iter_count(&to); + total_read = len - iov_iter_count(to); cifs_stats_bytes_read(tcon, total_read); @@ -2938,15 +2936,14 @@ error: rc = 0; if (total_read) { - iocb->ki_pos = pos + total_read; + iocb->ki_pos += total_read; return total_read; } return rc; } ssize_t -cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); struct cifsInodeInfo *cinode = CIFS_I(inode); @@ -2965,22 +2962,22 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, * pos+len-1. */ if (!CIFS_CACHE_READ(cinode)) - return cifs_user_readv(iocb, iov, nr_segs, pos); + return cifs_user_readv(iocb, to); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_read(iocb, iov, nr_segs, pos); + return generic_file_read_iter(iocb, to); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents reading. */ down_read(&cinode->lock_sem); - if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), + if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to), tcon->ses->server->vals->shared_lock_type, NULL, CIFS_READ_OP)) - rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + rc = generic_file_read_iter(iocb, to); up_read(&cinode->lock_sem); return rc; } @@ -3102,7 +3099,7 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) xid = get_xid(); if (!CIFS_CACHE_READ(CIFS_I(inode))) { - rc = cifs_invalidate_mapping(inode); + rc = cifs_zap_mapping(inode); if (rc) return rc; } @@ -3621,6 +3618,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3628,8 +3632,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3646,7 +3657,7 @@ void cifs_oplock_break(struct work_struct *work) if (!CIFS_CACHE_READ(cinode)) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); - cifs_invalidate_mapping(inode); + cifs_zap_mapping(inode); } cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc); } @@ -3666,6 +3677,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } /* @@ -3678,8 +3690,8 @@ void cifs_oplock_break(struct work_struct *work) * Direct IO is not yet supported in the cached mode. */ static ssize_t -cifs_direct_io(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t pos, unsigned long nr_segs) +cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t pos) { /* * FIXME diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index aadc2b68678b..a174605f6afa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -22,6 +22,7 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/freezer.h> #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -117,7 +118,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n", __func__, cifs_i->uniqueid); - cifs_i->invalid_mapping = true; + set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); } /* @@ -177,7 +178,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else cifs_i->time = jiffies; - cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING; + if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + set_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); + else + clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags); cifs_i->server_eof = fattr->cf_eof; /* @@ -1121,7 +1125,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, } /* try to set DELETE_ON_CLOSE */ - if (!cifsInode->delete_pending) { + if (!test_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags)) { rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid, current->tgid); /* @@ -1138,7 +1142,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, rc = -EBUSY; goto undo_rename; } - cifsInode->delete_pending = true; + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); } out_close: @@ -1737,6 +1741,9 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; + if (!cifs_sb->actimeo) + return true; + if (!time_in_range(jiffies, cifs_i->time, cifs_i->time + cifs_sb->actimeo)) return true; @@ -1756,23 +1763,62 @@ int cifs_invalidate_mapping(struct inode *inode) { int rc = 0; - struct cifsInodeInfo *cifs_i = CIFS_I(inode); - - cifs_i->invalid_mapping = false; if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = invalidate_inode_pages2(inode->i_mapping); - if (rc) { + if (rc) cifs_dbg(VFS, "%s: could not invalidate inode %p\n", __func__, inode); - cifs_i->invalid_mapping = true; - } } cifs_fscache_reset_inode_cookie(inode); return rc; } +/** + * cifs_wait_bit_killable - helper for functions that are sleeping on bit locks + * @word: long word containing the bit lock + */ +static int +cifs_wait_bit_killable(void *word) +{ + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + freezable_schedule_unsafe(); + return 0; +} + +int +cifs_revalidate_mapping(struct inode *inode) +{ + int rc; + unsigned long *flags = &CIFS_I(inode)->flags; + + rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, + TASK_KILLABLE); + if (rc) + return rc; + + if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) { + rc = cifs_invalidate_mapping(inode); + if (rc) + set_bit(CIFS_INO_INVALID_MAPPING, flags); + } + + clear_bit_unlock(CIFS_INO_LOCK, flags); + smp_mb__after_atomic(); + wake_up_bit(flags, CIFS_INO_LOCK); + + return rc; +} + +int +cifs_zap_mapping(struct inode *inode) +{ + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags); + return cifs_revalidate_mapping(inode); +} + int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; @@ -1839,9 +1885,7 @@ int cifs_revalidate_file(struct file *filp) if (rc) return rc; - if (CIFS_I(inode)->invalid_mapping) - rc = cifs_invalidate_mapping(inode); - return rc; + return cifs_revalidate_mapping(inode); } /* revalidate a dentry's inode attributes */ @@ -1854,9 +1898,7 @@ int cifs_revalidate_dentry(struct dentry *dentry) if (rc) return rc; - if (CIFS_I(inode)->invalid_mapping) - rc = cifs_invalidate_mapping(inode); - return rc; + return cifs_revalidate_mapping(inode); } int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 77492301cc2b..45cb59bcc791 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -85,7 +85,7 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, goto out_fput; } - src_inode = src_file.file->f_dentry->d_inode; + src_inode = file_inode(src_file.file); /* * Note: cifs case is easier than btrfs since server responsible for diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 264ece71bdb2..68559fd557fb 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -374,7 +374,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, oparms.cifs_sb = cifs_sb; oparms.desired_access = GENERIC_WRITE; oparms.create_options = create_options; - oparms.disposition = FILE_OPEN; + oparms.disposition = FILE_CREATE; oparms.path = path; oparms.fid = &fid; oparms.reconnect = false; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2f9f3790679d..3b0c62e622da 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 049884552e76..6834b9c3bec1 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -795,8 +795,8 @@ cifs_print_status(__u32 status_code) while (nt_errs[idx].nt_errstr != NULL) { if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) == (status_code & 0xFFFFFF)) { - printk(KERN_NOTICE "Status code returned 0x%08x %s\n", - status_code, nt_errs[idx].nt_errstr); + pr_notice("Status code returned 0x%08x %s\n", + status_code, nt_errs[idx].nt_errstr); } idx++; } @@ -941,8 +941,9 @@ cifs_UnixTimeToNT(struct timespec t) return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET; } -static int total_days_of_prev_months[] = -{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; +static const int total_days_of_prev_months[] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 +}; struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 526fb89f9230..d1fdfa848703 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6e..b8021fde987d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 192f51a12cf1..787844bde384 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -1036,6 +1047,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock) buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_lease, Name)); buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */ buf->Name[0] = 'R'; buf->Name[1] = 'q'; buf->Name[2] = 'L'; @@ -1062,6 +1074,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_lease_v2, Name)); buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */ buf->Name[0] = 'R'; buf->Name[1] = 'q'; buf->Name[2] = 'L'; @@ -1110,6 +1123,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1184,6 +1198,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1259,6 +1274,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 860344701067..b0b260dbb19d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -375,7 +375,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities); - memcpy(req->ClientGUID, cifs_client_guid, SMB2_CLIENT_GUID_SIZE); + /* ClientGUID must be zero for SMB2.02 dialect */ + if (ses->server->vals->protocol_id == SMB20_PROT_ID) + memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); + else + memcpy(req->ClientGUID, server->client_guid, + SMB2_CLIENT_GUID_SIZE); iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ @@ -478,7 +483,8 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) vneg_inbuf.Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); - memcpy(vneg_inbuf.Guid, cifs_client_guid, SMB2_CLIENT_GUID_SIZE); + memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, + SMB2_CLIENT_GUID_SIZE); if (tcon->ses->sign) vneg_inbuf.SecurityMode = @@ -966,6 +972,7 @@ create_durable_buf(void) buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_durable, Name)); buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DHnQ" */ buf->Name[0] = 'D'; buf->Name[1] = 'H'; buf->Name[2] = 'n'; @@ -990,6 +997,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid) buf->ccontext.NameLength = cpu_to_le16(4); buf->Data.Fid.PersistentFileId = fid->persistent_fid; buf->Data.Fid.VolatileFileId = fid->volatile_fid; + /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT is "DHnC" */ buf->Name[0] = 'D'; buf->Name[1] = 'H'; buf->Name[2] = 'n'; @@ -1089,6 +1097,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, int rc = 0; unsigned int num_iovecs = 2; __u32 file_attributes = 0; + char *dhc_buf = NULL, *lc_buf = NULL; cifs_dbg(FYI, "create/open\n"); @@ -1155,6 +1164,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, kfree(copy_path); return rc; } + lc_buf = iov[num_iovecs-1].iov_base; } if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { @@ -1169,9 +1179,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc) { cifs_small_buf_release(req); kfree(copy_path); - kfree(iov[num_iovecs-1].iov_base); + kfree(lc_buf); return rc; } + dhc_buf = iov[num_iovecs-1].iov_base; } rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); @@ -1203,6 +1214,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, *oplock = rsp->OplockLevel; creat_exit: kfree(copy_path); + kfree(lc_buf); + kfree(dhc_buf); free_rsp_buf(resp_buftype, rsp); return rc; } @@ -1352,7 +1365,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { int rc; - char *res_key = NULL; struct compress_ioctl fsctl_input; char *ret_data = NULL; @@ -1365,7 +1377,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, 2 /* in data len */, &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); - kfree(res_key); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 2022c542ea3a..69f3595d3952 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -183,8 +183,6 @@ struct smb2_symlink_err_rsp { #define SMB2_CLIENT_GUID_SIZE 16 -extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; - struct smb2_negotiate_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 36 */ @@ -437,11 +435,15 @@ struct smb2_tree_disconnect_rsp { #define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ #define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" -#define SMB2_CREATE_ALLOCATION_SIZE "AlSi" +#define SMB2_CREATE_ALLOCATION_SIZE "AISi" #define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" #define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" #define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" #define SMB2_CREATE_REQUEST_LEASE "RqLs" +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" +#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 +#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 struct smb2_create_req { struct smb2_hdr hdr; diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 911cf30d057d..7740b1c871c1 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -101,7 +101,7 @@ struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb) inode = coda_iget(sb, fid, &attr); if (IS_ERR(inode)) - printk("coda_cnode_make: coda_iget failed\n"); + pr_warn("%s: coda_iget failed\n", __func__); return inode; } @@ -137,7 +137,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) unsigned long hash = coda_f2i(fid); if ( !sb ) { - printk("coda_fid_to_inode: no sb!\n"); + pr_warn("%s: no sb!\n", __func__); return NULL; } diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index e7550cb9fb74..d42b725b1d21 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -12,6 +12,12 @@ #ifndef _LINUX_CODA_FS #define _LINUX_CODA_FS +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/param.h> #include <linux/mm.h> @@ -63,7 +69,7 @@ void coda_sysctl_clean(void); else \ ptr = (cast)vzalloc((unsigned long) size); \ if (!ptr) \ - printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ + pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \ } while (0) diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 5efbb5ee0adc..cd8a63238b11 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -102,7 +102,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig int type = 0; if (length > CODA_MAXNAMLEN) { - printk(KERN_ERR "name too long: lookup, %s (%*s)\n", + pr_err("name too long: lookup, %s (%*s)\n", coda_i2s(dir), (int)length, name); return ERR_PTR(-ENAMETOOLONG); } @@ -453,23 +453,23 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, sizeof(*vdir)); if (ret < 0) { - printk(KERN_ERR "coda readdir: read dir %s failed %d\n", - coda_f2s(&cii->c_fid), ret); + pr_err("%s: read dir %s failed %d\n", + __func__, coda_f2s(&cii->c_fid), ret); break; } if (ret == 0) break; /* end of directory file reached */ /* catch truncated reads */ if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) { - printk(KERN_ERR "coda readdir: short read on %s\n", - coda_f2s(&cii->c_fid)); + pr_err("%s: short read on %s\n", + __func__, coda_f2s(&cii->c_fid)); ret = -EBADF; break; } /* validate whether the directory file actually makes sense */ if (vdir->d_reclen < vdir_size + vdir->d_namlen) { - printk(KERN_ERR "coda readdir: invalid dir %s\n", - coda_f2s(&cii->c_fid)); + pr_err("%s: invalid dir %s\n", + __func__, coda_f2s(&cii->c_fid)); ret = -EBADF; break; } @@ -589,8 +589,8 @@ int coda_revalidate_inode(struct inode *inode) coda_vattr_to_iattr(inode, &attr); if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) { - printk("Coda: inode %ld, fid %s changed type!\n", - inode->i_ino, coda_f2s(&(cii->c_fid))); + pr_warn("inode %ld, fid %s changed type!\n", + inode->i_ino, coda_f2s(&(cii->c_fid))); } /* the following can happen when a local fid is replaced diff --git a/fs/coda/inode.c b/fs/coda/inode.c index d9c7751f10ac..fe3afb2de880 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -119,12 +119,12 @@ static int get_device_index(struct coda_mount_data *data) int idx; if (data == NULL) { - printk("coda_read_super: Bad mount data\n"); + pr_warn("%s: Bad mount data\n", __func__); return -1; } if (data->version != CODA_MOUNT_VERSION) { - printk("coda_read_super: Bad mount version\n"); + pr_warn("%s: Bad mount version\n", __func__); return -1; } @@ -141,13 +141,13 @@ static int get_device_index(struct coda_mount_data *data) fdput(f); if (idx < 0 || idx >= MAX_CODADEVS) { - printk("coda_read_super: Bad minor number\n"); + pr_warn("%s: Bad minor number\n", __func__); return -1; } return idx; Ebadf: - printk("coda_read_super: Bad file\n"); + pr_warn("%s: Bad file\n", __func__); return -1; } @@ -168,19 +168,19 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) if(idx == -1) idx = 0; - printk(KERN_INFO "coda_read_super: device index: %i\n", idx); + pr_info("%s: device index: %i\n", __func__, idx); vc = &coda_comms[idx]; mutex_lock(&vc->vc_mutex); if (!vc->vc_inuse) { - printk("coda_read_super: No pseudo device\n"); + pr_warn("%s: No pseudo device\n", __func__); error = -EINVAL; goto unlock_out; } if (vc->vc_sb) { - printk("coda_read_super: Device already mounted\n"); + pr_warn("%s: Device already mounted\n", __func__); error = -EBUSY; goto unlock_out; } @@ -204,22 +204,23 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); if ( error ) { - printk("coda_read_super: coda_get_rootfid failed with %d\n", - error); + pr_warn("%s: coda_get_rootfid failed with %d\n", + __func__, error); goto error; } - printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid)); + pr_info("%s: rootfid is %s\n", __func__, coda_f2s(&fid)); /* make root inode */ root = coda_cnode_make(&fid, sb); if (IS_ERR(root)) { error = PTR_ERR(root); - printk("Failure of coda_cnode_make for root: error %d\n", error); + pr_warn("Failure of coda_cnode_make for root: error %d\n", + error); goto error; } - printk("coda_read_super: rootinode is %ld dev %s\n", - root->i_ino, root->i_sb->s_id); + pr_info("%s: rootinode is %ld dev %s\n", + __func__, root->i_ino, root->i_sb->s_id); sb->s_root = d_make_root(root); if (!sb->s_root) { error = -EINVAL; @@ -246,7 +247,7 @@ static void coda_put_super(struct super_block *sb) sb->s_fs_info = NULL; mutex_unlock(&vcp->vc_mutex); - printk("Coda: Bye bye.\n"); + pr_info("Bye bye.\n"); } static void coda_evict_inode(struct inode *inode) diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index ebc2bae6c289..5c1e4242368b 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -114,14 +114,14 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, int size = sizeof(*dcbuf); if ( nbytes < sizeof(struct coda_out_hdr) ) { - printk("coda_downcall opc %d uniq %d, not enough!\n", - hdr.opcode, hdr.unique); + pr_warn("coda_downcall opc %d uniq %d, not enough!\n", + hdr.opcode, hdr.unique); count = nbytes; goto out; } if ( nbytes > size ) { - printk("Coda: downcall opc %d, uniq %d, too much!", - hdr.opcode, hdr.unique); + pr_warn("downcall opc %d, uniq %d, too much!", + hdr.opcode, hdr.unique); nbytes = size; } CODA_ALLOC(dcbuf, union outputArgs *, nbytes); @@ -136,7 +136,8 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, CODA_FREE(dcbuf, nbytes); if (error) { - printk("psdev_write: coda_downcall error: %d\n", error); + pr_warn("%s: coda_downcall error: %d\n", + __func__, error); retval = error; goto out; } @@ -157,16 +158,17 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, mutex_unlock(&vcp->vc_mutex); if (!req) { - printk("psdev_write: msg (%d, %d) not found\n", - hdr.opcode, hdr.unique); + pr_warn("%s: msg (%d, %d) not found\n", + __func__, hdr.opcode, hdr.unique); retval = -ESRCH; goto out; } /* move data into response buffer. */ if (req->uc_outSize < nbytes) { - printk("psdev_write: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n", - req->uc_outSize, (long)nbytes, hdr.opcode, hdr.unique); + pr_warn("%s: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n", + __func__, req->uc_outSize, (long)nbytes, + hdr.opcode, hdr.unique); nbytes = req->uc_outSize; /* don't have more space! */ } if (copy_from_user(req->uc_data, buf, nbytes)) { @@ -240,8 +242,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, /* Move the input args into userspace */ count = req->uc_inSize; if (nbytes < req->uc_inSize) { - printk ("psdev_read: Venus read %ld bytes of %d in message\n", - (long)nbytes, req->uc_inSize); + pr_warn("%s: Venus read %ld bytes of %d in message\n", + __func__, (long)nbytes, req->uc_inSize); count = nbytes; } @@ -305,7 +307,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file) struct upc_req *req, *tmp; if (!vcp || !vcp->vc_inuse ) { - printk("psdev_release: Not open.\n"); + pr_warn("%s: Not open.\n", __func__); return -1; } @@ -354,8 +356,8 @@ static int init_coda_psdev(void) { int i, err = 0; if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) { - printk(KERN_ERR "coda_psdev: unable to get major %d\n", - CODA_PSDEV_MAJOR); + pr_err("%s: unable to get major %d\n", + __func__, CODA_PSDEV_MAJOR); return -EIO; } coda_psdev_class = class_create(THIS_MODULE, "coda"); @@ -393,13 +395,13 @@ static int __init init_coda(void) goto out2; status = init_coda_psdev(); if ( status ) { - printk("Problem (%d) in init_coda_psdev\n", status); + pr_warn("Problem (%d) in init_coda_psdev\n", status); goto out1; } status = register_filesystem(&coda_fs_type); if (status) { - printk("coda: failed to register filesystem!\n"); + pr_warn("failed to register filesystem!\n"); goto out; } return 0; @@ -420,9 +422,8 @@ static void __exit exit_coda(void) int err, i; err = unregister_filesystem(&coda_fs_type); - if ( err != 0 ) { - printk("coda: failed to unregister filesystem\n"); - } + if (err != 0) + pr_warn("failed to unregister filesystem\n"); for (i = 0; i < MAX_CODADEVS; i++) device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); class_destroy(coda_psdev_class); diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index af56ad56a89a..34218a8a28cd 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -14,7 +14,7 @@ #ifdef CONFIG_SYSCTL static struct ctl_table_header *fs_table_header; -static ctl_table coda_table[] = { +static struct ctl_table coda_table[] = { { .procname = "timeout", .data = &coda_timeout, @@ -39,7 +39,7 @@ static ctl_table coda_table[] = { {} }; -static ctl_table fs_table[] = { +static struct ctl_table fs_table[] = { { .procname = "coda", .mode = 0555, diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 3a731976dc5e..21fcf8dcb9cd 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -508,8 +508,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, inp->coda_ioctl.data = (char *)(INSIZE(ioctl)); /* get the data out of user space */ - if ( copy_from_user((char*)inp + (long)inp->coda_ioctl.data, - data->vi.in, data->vi.in_size) ) { + if (copy_from_user((char *)inp + (long)inp->coda_ioctl.data, + data->vi.in, data->vi.in_size)) { error = -EINVAL; goto exit; } @@ -518,8 +518,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, &outsize, inp); if (error) { - printk("coda_pioctl: Venus returns: %d for %s\n", - error, coda_f2s(fid)); + pr_warn("%s: Venus returns: %d for %s\n", + __func__, error, coda_f2s(fid)); goto exit; } @@ -675,7 +675,7 @@ static int coda_upcall(struct venus_comm *vcp, mutex_lock(&vcp->vc_mutex); if (!vcp->vc_inuse) { - printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n"); + pr_notice("Venus dead, not sending upcall\n"); error = -ENXIO; goto exit; } @@ -725,7 +725,7 @@ static int coda_upcall(struct venus_comm *vcp, error = -EINTR; if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) { - printk(KERN_WARNING "coda: Unexpected interruption.\n"); + pr_warn("Unexpected interruption.\n"); goto exit; } @@ -735,7 +735,7 @@ static int coda_upcall(struct venus_comm *vcp, /* Venus saw the upcall, make sure we can send interrupt signal */ if (!vcp->vc_inuse) { - printk(KERN_INFO "coda: Venus dead, not sending signal.\n"); + pr_info("Venus dead, not sending signal.\n"); goto exit; } diff --git a/fs/compat.c b/fs/compat.c index ca926ad0430c..66d3d3c6b4b2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -457,9 +457,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: ret = get_compat_flock64(&f, compat_ptr(arg)); if (ret != 0) break; @@ -468,7 +468,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, conv_cmd = convert_fcntl_cmd(cmd); ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_GETLKP) && ret == 0) { + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { /* need to return lock information - see above for commentary */ if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; @@ -493,9 +493,9 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_GETLKP: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: return -EINVAL; } return compat_sys_fcntl64(fd, cmd, arg); diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index b5f0a3b91f18..bd4a3c167091 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -24,6 +24,12 @@ * configfs Copyright (C) 2005 Oracle. All rights reserved. */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index e081acbac2e7..668dcabc5695 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -940,9 +940,9 @@ static void client_drop_item(struct config_item *parent_item, #ifdef DEBUG static void configfs_dump_one(struct configfs_dirent *sd, int level) { - printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); + pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd)); -#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); +#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type); type_print(CONFIGFS_ROOT); type_print(CONFIGFS_DIR); type_print(CONFIGFS_ITEM_ATTR); @@ -1699,7 +1699,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) struct dentry *root = dentry->d_sb->s_root; if (dentry->d_parent != root) { - printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); + pr_err("Tried to unregister non-subsystem!\n"); return; } @@ -1709,7 +1709,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { - printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); + pr_err("Tried to unregister non-empty subsystem!\n"); } spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index a9d35b0e06cf..5946ad98053f 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -168,9 +168,8 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, * In practice the maximum level of locking depth is * already reached. Just inform about possible reasons. */ - printk(KERN_INFO "configfs: Too many levels of inodes" - " for the locking correctness validator.\n"); - printk(KERN_INFO "Spurious warnings may appear.\n"); + pr_info("Too many levels of inodes for the locking correctness validator.\n"); + pr_info("Spurious warnings may appear.\n"); } } } diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 50cee7f9110b..e65f9ffbb999 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -19,7 +19,7 @@ * Boston, MA 021110-1307, USA. * * Based on kobject: - * kobject is Copyright (c) 2002-2003 Patrick Mochel + * kobject is Copyright (c) 2002-2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. * @@ -35,9 +35,9 @@ #include <linux/configfs.h> -static inline struct config_item * to_item(struct list_head * entry) +static inline struct config_item *to_item(struct list_head *entry) { - return container_of(entry,struct config_item,ci_entry); + return container_of(entry, struct config_item, ci_entry); } /* Evil kernel */ @@ -47,34 +47,35 @@ static void config_item_release(struct kref *kref); * config_item_init - initialize item. * @item: item in question. */ -void config_item_init(struct config_item * item) +void config_item_init(struct config_item *item) { kref_init(&item->ci_kref); INIT_LIST_HEAD(&item->ci_entry); } +EXPORT_SYMBOL(config_item_init); /** * config_item_set_name - Set the name of an item * @item: item. - * @name: name. + * @fmt: The vsnprintf()'s format string. * * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a * dynamically allocated string that @item->ci_name points to. * Otherwise, use the static @item->ci_namebuf array. */ -int config_item_set_name(struct config_item * item, const char * fmt, ...) +int config_item_set_name(struct config_item *item, const char *fmt, ...) { int error = 0; int limit = CONFIGFS_ITEM_NAME_LEN; int need; va_list args; - char * name; + char *name; /* * First, try the static array */ - va_start(args,fmt); - need = vsnprintf(item->ci_namebuf,limit,fmt,args); + va_start(args, fmt); + need = vsnprintf(item->ci_namebuf, limit, fmt, args); va_end(args); if (need < limit) name = item->ci_namebuf; @@ -83,13 +84,13 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) * Need more space? Allocate it and try again */ limit = need + 1; - name = kmalloc(limit,GFP_KERNEL); + name = kmalloc(limit, GFP_KERNEL); if (!name) { error = -ENOMEM; goto Done; } - va_start(args,fmt); - need = vsnprintf(name,limit,fmt,args); + va_start(args, fmt); + need = vsnprintf(name, limit, fmt, args); va_end(args); /* Still? Give up. */ @@ -109,7 +110,6 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) Done: return error; } - EXPORT_SYMBOL(config_item_set_name); void config_item_init_type_name(struct config_item *item, @@ -131,20 +131,21 @@ void config_group_init_type_name(struct config_group *group, const char *name, } EXPORT_SYMBOL(config_group_init_type_name); -struct config_item * config_item_get(struct config_item * item) +struct config_item *config_item_get(struct config_item *item) { if (item) kref_get(&item->ci_kref); return item; } +EXPORT_SYMBOL(config_item_get); -static void config_item_cleanup(struct config_item * item) +static void config_item_cleanup(struct config_item *item) { - struct config_item_type * t = item->ci_type; - struct config_group * s = item->ci_group; - struct config_item * parent = item->ci_parent; + struct config_item_type *t = item->ci_type; + struct config_group *s = item->ci_group; + struct config_item *parent = item->ci_parent; - pr_debug("config_item %s: cleaning up\n",config_item_name(item)); + pr_debug("config_item %s: cleaning up\n", config_item_name(item)); if (item->ci_name != item->ci_namebuf) kfree(item->ci_name); item->ci_name = NULL; @@ -167,21 +168,23 @@ static void config_item_release(struct kref *kref) * * Decrement the refcount, and if 0, call config_item_cleanup(). */ -void config_item_put(struct config_item * item) +void config_item_put(struct config_item *item) { if (item) kref_put(&item->ci_kref, config_item_release); } +EXPORT_SYMBOL(config_item_put); /** * config_group_init - initialize a group for use - * @k: group + * @group: config_group */ void config_group_init(struct config_group *group) { config_item_init(&group->cg_item); INIT_LIST_HEAD(&group->cg_children); } +EXPORT_SYMBOL(config_group_init); /** * config_group_find_item - search for item in group. @@ -195,11 +198,11 @@ void config_group_init(struct config_group *group) struct config_item *config_group_find_item(struct config_group *group, const char *name) { - struct list_head * entry; - struct config_item * ret = NULL; + struct list_head *entry; + struct config_item *ret = NULL; - list_for_each(entry,&group->cg_children) { - struct config_item * item = to_item(entry); + list_for_each(entry, &group->cg_children) { + struct config_item *item = to_item(entry); if (config_item_name(item) && !strcmp(config_item_name(item), name)) { ret = config_item_get(item); @@ -208,9 +211,4 @@ struct config_item *config_group_find_item(struct config_group *group, } return ret; } - -EXPORT_SYMBOL(config_item_init); -EXPORT_SYMBOL(config_group_init); -EXPORT_SYMBOL(config_item_get); -EXPORT_SYMBOL(config_item_put); EXPORT_SYMBOL(config_group_find_item); diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 7f26c3cf75ae..f6c285833390 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -85,7 +85,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); } else { - pr_debug("configfs: could not get root inode\n"); + pr_debug("could not get root inode\n"); return -ENOMEM; } @@ -155,7 +155,7 @@ static int __init configfs_init(void) return 0; out4: - printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + pr_err("Unable to register filesystem!\n"); configfs_inode_exit(); out3: kobject_put(config_kobj); diff --git a/fs/coredump.c b/fs/coredump.c index e3ad709a4232..a93f7e6ea4cf 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { int free, need; + va_list arg_copy; again: free = cn->size - cn->used; - need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + if (need < free) { cn->used += need; return 0; @@ -301,7 +306,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(nr < 0)) return nr; - tsk->flags = PF_DUMPCORE; + tsk->flags |= PF_DUMPCORE; if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* diff --git a/fs/dcache.c b/fs/dcache.c index 40707d88a945..06f65857a855 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -150,7 +150,7 @@ static long get_nr_dentry_unused(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); @@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) +static void dentry_free(struct dentry *dentry) { - BUG_ON((int)dentry->d_lockref.count > 0); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - /* if dentry was never visible to RCU, immediate free is OK */ if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); @@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry) d_lru_add(dentry); } -/* - * Remove a dentry with references from the LRU. - * - * If we are on the shrink list, then we can get to try_prune_one_dentry() and - * lose our last reference through the parent walk. In this case, we need to - * remove ourselves from the shrink list, not the LRU. - */ -static void dentry_lru_del(struct dentry *dentry) -{ - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) - return d_shrink_del(dentry); - d_lru_del(dentry); - } -} - -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) -{ - list_del(&dentry->d_u.d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * If ref is non-zero, then decrement the refcount too. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static struct dentry * -dentry_kill(struct dentry *dentry, int unlock_on_failure) - __releases(dentry->d_lock) +static void __dentry_kill(struct dentry *dentry) { - struct inode *inode; - struct dentry *parent; - - inode = dentry->d_inode; - if (inode && !spin_trylock(&inode->i_lock)) { -relock: - if (unlock_on_failure) { - spin_unlock(&dentry->d_lock); - cpu_relax(); - } - return dentry; /* try again with same dentry */ - } - if (IS_ROOT(dentry)) - parent = NULL; - else + struct dentry *parent = NULL; + bool can_free = true; + if (!IS_ROOT(dentry)) parent = dentry->d_parent; - if (parent && !spin_trylock(&parent->d_lock)) { - if (inode) - spin_unlock(&inode->i_lock); - goto relock; - } /* * The dentry is now unrecoverably dead to the world. @@ -543,10 +460,105 @@ relock: if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + list_del(&dentry->d_u.d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON((int)dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); +} + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto failed; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + if (inode) + spin_unlock(&inode->i_lock); + goto failed; + } + } + + __dentry_kill(dentry); + return parent; + +failed: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ +} + +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (unlikely((int)dentry->d_lockref.count < 0)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + rcu_read_lock(); + spin_unlock(&dentry->d_lock); +again: + parent = ACCESS_ONCE(dentry->d_parent); + spin_lock(&parent->d_lock); + /* + * We can't blindly lock dentry until we are sure + * that we won't violate the locking order. + * Any changes of dentry->d_parent must have + * been done with parent->d_lock held, so + * spin_lock() above is enough of a barrier + * for checking if it's still our child. + */ + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + goto again; + } + rcu_read_unlock(); + if (parent != dentry) + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + else + parent = NULL; + return parent; } /* @@ -602,7 +614,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry, 1); + dentry = dentry_kill(dentry); if (dentry) goto repeat; } @@ -815,64 +827,15 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static struct dentry * try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct dentry *parent; - - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return NULL; - if (parent == dentry) - return dentry; - - /* Prune ancestors. */ - dentry = parent; - while (dentry) { - if (lockref_put_or_lock(&dentry->d_lockref)) - return NULL; - dentry = dentry_kill(dentry, 1); - } - return NULL; -} - static void shrink_dentry_list(struct list_head *list) { - struct dentry *dentry; - - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ + struct dentry *dentry, *parent; - /* - * Get the dentry lock, and re-verify that the dentry is - * this on the shrinking list. If it is, we know that - * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. - */ + while (!list_empty(list)) { + struct inode *inode; + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } + parent = lock_parent(dentry); /* * The dispose list is isolated and dentries are not accounted @@ -885,30 +848,63 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if (dentry->d_lockref.count) { + if ((int)dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); continue; } - rcu_read_unlock(); - /* - * If 'try_to_prune()' returns a dentry, it will - * be the same one we passed in, and d_lock will - * have been held the whole time, so it will not - * have been added to any other lists. We failed - * to get the inode lock. - * - * We just add it back to the shrink list. - */ - dentry = try_prune_one_dentry(dentry); - rcu_read_lock(); - if (dentry) { + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + bool can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + if (can_free) + dentry_free(dentry); + continue; + } + + inode = dentry->d_inode; + if (inode && unlikely(!spin_trylock(&inode->i_lock))) { d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + continue; + } + + __dentry_kill(dentry); + + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { + parent = lock_parent(dentry); + if (dentry->d_lockref.count != 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + break; + } + inode = dentry->d_inode; /* can't be NULL */ + if (unlikely(!spin_trylock(&inode->i_lock))) { + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + cpu_relax(); + continue; + } + __dentry_kill(dentry); + dentry = parent; } } - rcu_read_unlock(); } static enum lru_status @@ -1261,34 +1257,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - ret = D_WALK_NORETRY; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ - if (data->found && need_resched()) - ret = D_WALK_QUIT; + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } @@ -1318,45 +1303,35 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); -static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { - struct select_data *data = _data; - enum d_walk_ret ret = D_WALK_CONTINUE; + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - if (likely(!list_empty(&dentry->d_subdirs))) - goto out; - if (dentry == data->start && dentry->d_lockref.count == 1) - goto out; - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, + dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - BUG(); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); - data->found++; - ret = D_WALK_NORETRY; - } -out: - if (data->found && need_resched()) - ret = D_WALK_QUIT; - return ret; + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); } /* @@ -1366,40 +1341,15 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; - for (;;) { - struct select_data data; - - INIT_LIST_HEAD(&data.dispose); - data.start = dentry; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (!data.found) - break; - - shrink_dentry_list(&data.dispose); - cond_resched(); - } - d_drop(dentry); - dput(dentry); + do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { - struct select_data data; - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - - INIT_LIST_HEAD(&data.dispose); - data.start = NULL; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (data.found) - shrink_dentry_list(&data.dispose); - cond_resched(); + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); } } @@ -1647,8 +1597,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_ENTRY_TYPE; - dentry->d_flags |= add_flags; + __d_set_type(dentry, add_flags); if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c71038079b47..cfe8466f7fef 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -10,6 +10,8 @@ * * ------------------------------------------------------------------------- */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> @@ -148,10 +150,10 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) /* * parse_mount_options(): - * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. The exception is - * 'newinstance' option which can only be set/cleared on a mount (i.e. - * cannot be changed during remount). + * Set @opts to mount options specified in @data. If an option is not + * specified in @data, set it to its default value. The exception is + * 'newinstance' option which can only be set/cleared on a mount (i.e. + * cannot be changed during remount). * * Note: @data may be NULL (in which case all options are set to default). */ @@ -225,7 +227,7 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) break; #endif default: - printk(KERN_ERR "devpts: called with bogus options\n"); + pr_err("called with bogus options\n"); return -EINVAL; } } @@ -261,7 +263,7 @@ static int mknod_ptmx(struct super_block *sb) dentry = d_alloc_name(root, "ptmx"); if (!dentry) { - printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n"); + pr_err("Unable to alloc dentry for ptmx node\n"); goto out; } @@ -270,7 +272,7 @@ static int mknod_ptmx(struct super_block *sb) */ inode = new_inode(sb); if (!inode) { - printk(KERN_ERR "Unable to alloc inode for ptmx node\n"); + pr_err("Unable to alloc inode for ptmx node\n"); dput(dentry); goto out; } @@ -303,7 +305,7 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) #else static inline void update_ptmx_mode(struct pts_fs_info *fsi) { - return; + return; } #endif @@ -333,9 +335,11 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) struct pts_mount_opts *opts = &fsi->mount_opts; if (opts->setuid) - seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); if (opts->setgid) - seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); @@ -396,7 +400,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) if (s->s_root) return 0; - printk(KERN_ERR "devpts: get root dentry failed\n"); + pr_err("get root dentry failed\n"); fail: return -ENOMEM; diff --git a/fs/direct-io.c b/fs/direct-io.c index 31ba0935e32e..17e39b047de5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -71,13 +71,11 @@ struct dio_submit { been performed at the start of a write */ int pages_in_io; /* approximate total IO pages */ - size_t size; /* total request size (doesn't change)*/ sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ int reap_counter; /* rate limit reaping */ sector_t final_block_in_request;/* doesn't change */ - unsigned first_block_in_page; /* doesn't change, Used only once */ int boundary; /* prev block is at a boundary */ get_block_t *get_block; /* block mapping function */ dio_submit_t *submit_io; /* IO submition function */ @@ -98,19 +96,14 @@ struct dio_submit { sector_t cur_page_block; /* Where it starts */ loff_t cur_page_fs_offset; /* Offset in file */ - /* - * Page fetching state. These variables belong to dio_refill_pages(). - */ - int curr_page; /* changes */ - int total_pages; /* doesn't change */ - unsigned long curr_user_address;/* changes */ - + struct iov_iter *iter; /* * Page queue. These variables belong to dio_refill_pages() and * dio_get_page(). */ unsigned head; /* next page to process */ unsigned tail; /* last valid page + 1 */ + size_t from, to; }; /* dio_state communicated between submission path and end_io */ @@ -163,15 +156,10 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { - int ret; - int nr_pages; + ssize_t ret; - nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); - ret = get_user_pages_fast( - sdio->curr_user_address, /* Where from? */ - nr_pages, /* How many pages? */ - dio->rw == READ, /* Write to memory? */ - &dio->pages[0]); /* Put results here */ + ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, + &sdio->from); if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { struct page *page = ZERO_PAGE(0); @@ -186,18 +174,19 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) dio->pages[0] = page; sdio->head = 0; sdio->tail = 1; - ret = 0; - goto out; + sdio->from = 0; + sdio->to = PAGE_SIZE; + return 0; } if (ret >= 0) { - sdio->curr_user_address += ret * PAGE_SIZE; - sdio->curr_page += ret; + iov_iter_advance(sdio->iter, ret); + ret += sdio->from; sdio->head = 0; - sdio->tail = ret; - ret = 0; + sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1; + return 0; } -out: return ret; } @@ -208,7 +197,7 @@ out: * L1 cache. */ static inline struct page *dio_get_page(struct dio *dio, - struct dio_submit *sdio) + struct dio_submit *sdio) { if (dio_pages_present(sdio) == 0) { int ret; @@ -218,7 +207,7 @@ static inline struct page *dio_get_page(struct dio *dio, return ERR_PTR(ret); BUG_ON(dio_pages_present(sdio) == 0); } - return dio->pages[sdio->head++]; + return dio->pages[sdio->head]; } /** @@ -422,8 +411,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) */ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (dio_pages_present(sdio)) - page_cache_release(dio_get_page(dio, sdio)); + while (sdio->head < sdio->tail) + page_cache_release(dio->pages[sdio->head++]); } /* @@ -912,23 +901,22 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; - struct page *page; - unsigned block_in_page; int ret = 0; - /* The I/O can start at any block offset within the first page */ - block_in_page = sdio->first_block_in_page; - while (sdio->block_in_file < sdio->final_block_in_request) { + struct page *page; + size_t from, to; + page = dio_get_page(dio, sdio); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } + from = sdio->head ? 0 : sdio->from; + to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE; + sdio->head++; - while (block_in_page < blocks_per_page) { - unsigned offset_in_page = block_in_page << blkbits; + while (from < to) { unsigned this_chunk_bytes; /* # of bytes mapped */ unsigned this_chunk_blocks; /* # of blocks */ unsigned u; @@ -999,10 +987,10 @@ do_holes: page_cache_release(page); goto out; } - zero_user(page, block_in_page << blkbits, - 1 << blkbits); + zero_user(page, from, 1 << blkbits); sdio->block_in_file++; - block_in_page++; + from += 1 << blkbits; + dio->result += 1 << blkbits; goto next_block; } @@ -1019,7 +1007,7 @@ do_holes: * can add to this page */ this_chunk_blocks = sdio->blocks_available; - u = (PAGE_SIZE - offset_in_page) >> blkbits; + u = (to - from) >> blkbits; if (this_chunk_blocks > u) this_chunk_blocks = u; u = sdio->final_block_in_request - sdio->block_in_file; @@ -1031,7 +1019,7 @@ do_holes: if (this_chunk_blocks == sdio->blocks_available) sdio->boundary = buffer_boundary(map_bh); ret = submit_page_section(dio, sdio, page, - offset_in_page, + from, this_chunk_bytes, sdio->next_block_for_io, map_bh); @@ -1042,7 +1030,8 @@ do_holes: sdio->next_block_for_io += this_chunk_blocks; sdio->block_in_file += this_chunk_blocks; - block_in_page += this_chunk_blocks; + from += this_chunk_bytes; + dio->result += this_chunk_bytes; sdio->blocks_available -= this_chunk_blocks; next_block: BUG_ON(sdio->block_in_file > sdio->final_block_in_request); @@ -1052,7 +1041,6 @@ next_block: /* Drop the ref which was taken in get_user_pages() */ page_cache_release(page); - block_in_page = 0; } out: return ret; @@ -1107,24 +1095,21 @@ static inline int drop_refcount(struct dio *dio) */ static inline ssize_t do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + struct block_device *bdev, struct iov_iter *iter, loff_t offset, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { - int seg; - size_t size; - unsigned long addr; unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - loff_t end = offset; + size_t count = iov_iter_count(iter); + loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; - unsigned long user_addr; - size_t bytes; struct buffer_head map_bh = { 0, }; struct blk_plug plug; + unsigned long align = offset | iov_iter_alignment(iter); if (rw & WRITE) rw = WRITE_ODIRECT; @@ -1134,32 +1119,16 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * the early prefetch in the caller enough time. */ - if (offset & blocksize_mask) { + if (align & blocksize_mask) { if (bdev) blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if (offset & blocksize_mask) + if (align & blocksize_mask) goto out; } - /* Check the memory alignment. Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; - end += size; - if (unlikely((addr & blocksize_mask) || - (size & blocksize_mask))) { - if (bdev) - blkbits = blksize_bits( - bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - } - } - /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && end == offset) + if (rw == READ && !iov_iter_count(iter)) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); @@ -1249,6 +1218,10 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; + sdio.iter = iter; + sdio.final_block_in_request = + (offset + iov_iter_count(iter)) >> blkbits; + /* * In case of non-aligned buffers, we may need 2 more * pages since we need to zero out first and last block. @@ -1256,47 +1229,13 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (unlikely(sdio.blkfactor)) sdio.pages_in_io = 2; - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio.pages_in_io += - ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / - PAGE_SIZE - user_addr / PAGE_SIZE); - } + sdio.pages_in_io += iov_iter_npages(iter, INT_MAX); blk_start_plug(&plug); - for (seg = 0; seg < nr_segs; seg++) { - user_addr = (unsigned long)iov[seg].iov_base; - sdio.size += bytes = iov[seg].iov_len; - - /* Index into the first page of the first block */ - sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; - sdio.final_block_in_request = sdio.block_in_file + - (bytes >> blkbits); - /* Page fetching state */ - sdio.head = 0; - sdio.tail = 0; - sdio.curr_page = 0; - - sdio.total_pages = 0; - if (user_addr & (PAGE_SIZE-1)) { - sdio.total_pages++; - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); - } - sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - sdio.curr_user_address = user_addr; - - retval = do_direct_IO(dio, &sdio, &map_bh); - - dio->result += iov[seg].iov_len - - ((sdio.final_block_in_request - sdio.block_in_file) << - blkbits); - - if (retval) { - dio_cleanup(dio, &sdio); - break; - } - } /* end iovec loop */ + retval = do_direct_IO(dio, &sdio, &map_bh); + if (retval) + dio_cleanup(dio, &sdio); if (retval == -ENOTBLK) { /* @@ -1348,10 +1287,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(retval == -EIOCBQUEUED); if (dio->is_async && retval == 0 && dio->result && - ((rw == READ) || (dio->result == sdio.size))) + (rw == READ || dio->result == count)) retval = -EIOCBQUEUED; - - if (retval != -EIOCBQUEUED) + else dio_await_completion(dio); if (drop_refcount(dio) == 0) { @@ -1365,8 +1303,8 @@ out: ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + struct block_device *bdev, struct iov_iter *iter, loff_t offset, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { /* @@ -1381,9 +1319,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, prefetch(bdev->bd_queue); prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); - return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, - submit_io, flags); + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset, + get_block, end_io, submit_io, flags); } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 76feb4b60fa6..d521bddf876d 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -157,11 +157,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, const char *buf, size_t len) { unsigned int x; + int rc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - - x = simple_strtoul(buf, NULL, 0); + rc = kstrtouint(buf, 0, &x); + if (rc) + return rc; if (check_zero && !x) return -EINVAL; @@ -730,7 +732,10 @@ static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, size_t len) { - cm->nodeid = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &cm->nodeid); + + if (rc) + return rc; return len; } @@ -742,7 +747,10 @@ static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, size_t len) { - cm->local= simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &cm->local); + + if (rc) + return rc; if (cm->local && !local_comm) local_comm = cm; return len; @@ -846,7 +854,10 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, size_t len) { uint32_t seq = 0; - nd->nodeid = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &nd->nodeid); + + if (rc) + return rc; dlm_comm_seq(nd->nodeid, &seq); nd->comm_seq = seq; return len; @@ -860,7 +871,10 @@ static ssize_t node_weight_read(struct dlm_node *nd, char *buf) static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, size_t len) { - nd->weight = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &nd->weight); + + if (rc) + return rc; return len; } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index b969deef9ebb..8d77ba7b1756 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -68,7 +68,7 @@ static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - return seq_printf(s, "\n"); + return seq_puts(s, "\n"); } static int print_format1(struct dlm_rsb *res, struct seq_file *s) @@ -92,31 +92,31 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } if (res->res_nodeid > 0) - rv = seq_printf(s, "\" \nLocal Copy, Master is node %d\n", + rv = seq_printf(s, "\"\nLocal Copy, Master is node %d\n", res->res_nodeid); else if (res->res_nodeid == 0) - rv = seq_printf(s, "\" \nMaster Copy\n"); + rv = seq_puts(s, "\"\nMaster Copy\n"); else if (res->res_nodeid == -1) - rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n", + rv = seq_printf(s, "\"\nLooking up master (lkid %x)\n", res->res_first_lkid); else - rv = seq_printf(s, "\" \nInvalid master %d\n", + rv = seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid); if (rv) goto out; /* Print the LVB: */ if (res->res_lvbptr) { - seq_printf(s, "LVB: "); + seq_puts(s, "LVB: "); for (i = 0; i < lvblen; i++) { if (i == lvblen / 2) - seq_printf(s, "\n "); + seq_puts(s, "\n "); seq_printf(s, "%02x ", (unsigned char) res->res_lvbptr[i]); } if (rsb_flag(res, RSB_VALNOTVALID)) - seq_printf(s, " (INVALID)"); - rv = seq_printf(s, "\n"); + seq_puts(s, " (INVALID)"); + rv = seq_puts(s, "\n"); if (rv) goto out; } @@ -133,21 +133,21 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } /* Print the locks attached to this resource */ - seq_printf(s, "Granted Queue\n"); + seq_puts(s, "Granted Queue\n"); list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) goto out; } - seq_printf(s, "Conversion Queue\n"); + seq_puts(s, "Conversion Queue\n"); list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) goto out; } - seq_printf(s, "Waiting Queue\n"); + seq_puts(s, "Waiting Queue\n"); list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { rv = print_format1_lock(s, lkb, res); if (rv) @@ -157,13 +157,13 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) if (list_empty(&res->res_lookup)) goto out; - seq_printf(s, "Lookup Queue\n"); + seq_puts(s, "Lookup Queue\n"); list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { rv = seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); } out: unlock_rsb(res); @@ -300,7 +300,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); if (rv) goto out; @@ -311,7 +311,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) for (i = 0; i < lvblen; i++) seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); if (rv) goto out; @@ -377,7 +377,7 @@ static int print_format4(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_printf(s, "\n"); + rv = seq_puts(s, "\n"); out: unlock_rsb(r); return rv; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 04d6398c1f1c..f3e72787e7f9 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -35,8 +35,11 @@ static struct task_struct * scand_task; static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) { ssize_t ret = len; - int n = simple_strtol(buf, NULL, 0); + int n; + int rc = kstrtoint(buf, 0, &n); + if (rc) + return rc; ls = dlm_find_lockspace_local(ls->ls_local_handle); if (!ls) return -EINVAL; @@ -57,7 +60,10 @@ static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) { - ls->ls_uevent_result = simple_strtol(buf, NULL, 0); + int rc = kstrtoint(buf, 0, &ls->ls_uevent_result); + + if (rc) + return rc; set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags); wake_up(&ls->ls_uevent_wait); return len; @@ -70,7 +76,10 @@ static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) { - ls->ls_global_id = simple_strtoul(buf, NULL, 0); + int rc = kstrtouint(buf, 0, &ls->ls_global_id); + + if (rc) + return rc; return len; } @@ -81,7 +90,11 @@ static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf) static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len) { - int val = simple_strtoul(buf, NULL, 0); + int val; + int rc = kstrtoint(buf, 0, &val); + + if (rc) + return rc; if (val == 1) set_bit(LSFL_NODIR, &ls->ls_flags); return len; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 1e5b45359509..d08e079ea5d3 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -617,6 +617,11 @@ static void retry_failed_sctp_send(struct connection *recv_con, int nodeid = sn_send_failed->ssf_info.sinfo_ppid; log_print("Retry sending %d bytes to node id %d", len, nodeid); + + if (!nodeid) { + log_print("Shouldn't resend data via listening connection."); + return; + } con = nodeid2con(nodeid, 0); if (!con) { diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 9280202e488c..1de7294aad20 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -50,7 +50,7 @@ static void drop_slab(void) } while (nr_objects > 10); } -int drop_caches_sysctl_handler(ctl_table *table, int write, +int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index b1eaa7a1f82c..db0fad3269c0 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -45,14 +45,13 @@ * The function to be used for directory reads is ecryptfs_read. */ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + struct iov_iter *to) { ssize_t rc; struct path *path; struct file *file = iocb->ki_filp; - rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + rc = generic_file_read_iter(iocb, to); /* * Even though this is a async interface, we need to wait * for IO to finish to update atime @@ -352,10 +351,10 @@ const struct file_operations ecryptfs_dir_fops = { const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = ecryptfs_read_update_atime, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = ecryptfs_read_update_atime, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index becc725a1953..0a48886e069c 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -83,7 +83,7 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) return 0; } -static struct dentry_operations efivarfs_d_ops = { +static const struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, .d_delete = always_delete_dentry, diff --git a/fs/efs/dir.c b/fs/efs/dir.c index b72307ccdf7a..ce63b24f7c3e 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -26,7 +26,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) int slot; if (inode->i_size & (EFS_DIRBSIZE-1)) - printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); /* work out where this entry can be found */ block = ctx->pos >> EFS_DIRBSIZE_BITS; @@ -43,14 +44,15 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block); + pr_err("%s(): failed to read dir block %d\n", + __func__, block); break; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - printk(KERN_ERR "EFS: readdir(): invalid directory block\n"); + pr_err("%s(): invalid directory block\n", __func__); brelse(bh); break; } @@ -69,10 +71,9 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) inodenum = be32_to_cpu(dirslot->inode); namelen = dirslot->namelen; nameptr = dirslot->name; - -#ifdef DEBUG - printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); -#endif + pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", + __func__, block, slot, dirblock->slots-1, + inodenum, nameptr, namelen); if (!namelen) continue; /* found the next entry */ @@ -80,7 +81,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) /* sanity check */ if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { - printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); + pr_warn("directory entry %d exceeds directory block\n", + slot); continue; } diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 5528926ac7f6..5bbf9612140c 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -7,6 +7,12 @@ #ifndef _EFS_EFS_H_ #define _EFS_EFS_H_ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <asm/uaccess.h> diff --git a/fs/efs/file.c b/fs/efs/file.c index 1ccb364ffa63..a37dcee46866 100644 --- a/fs/efs/file.c +++ b/fs/efs/file.c @@ -22,10 +22,8 @@ int efs_get_block(struct inode *inode, sector_t iblock, /* * i have no idea why this happens as often as it does */ - printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", - block, - inode->i_blocks, - inode->i_size); + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); #endif return 0; } @@ -38,7 +36,7 @@ int efs_get_block(struct inode *inode, sector_t iblock, int efs_bmap(struct inode *inode, efs_block_t block) { if (block < 0) { - printk(KERN_WARNING "EFS: bmap(): block < 0\n"); + pr_warn("%s(): block < 0\n", __func__); return 0; } @@ -48,10 +46,8 @@ int efs_bmap(struct inode *inode, efs_block_t block) { /* * i have no idea why this happens as often as it does */ - printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", - block, - inode->i_blocks, - inode->i_size); + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); #endif return 0; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index d15ccf20f1b3..079d20306ee1 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -89,7 +89,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) bh = sb_bread(inode->i_sb, block); if (!bh) { - printk(KERN_WARNING "EFS: bread() failed at block %d\n", block); + pr_warn("%s() failed at block %d\n", __func__, block); goto read_inode_error; } @@ -130,19 +130,16 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) for(i = 0; i < EFS_DIRECTEXTENTS; i++) { extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { - printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); + pr_warn("extent %d has bad magic number in inode %lu\n", + i, inode->i_ino); brelse(bh); goto read_inode_error; } } brelse(bh); - -#ifdef DEBUG - printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n", - inode->i_ino, in->numextents, inode->i_mode); -#endif - + pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n", + inode->i_ino, in->numextents, inode->i_mode); switch (inode->i_mode & S_IFMT) { case S_IFDIR: inode->i_op = &efs_dir_inode_operations; @@ -162,7 +159,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) init_special_inode(inode, inode->i_mode, device); break; default: - printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode); + pr_warn("unsupported inode mode %o\n", inode->i_mode); goto read_inode_error; break; } @@ -171,7 +168,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) return inode; read_inode_error: - printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino); + pr_warn("failed to read inode %lu\n", inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } @@ -216,7 +213,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { /* if we only have one extent then nothing can be found */ if (in->numextents == 1) { - printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n"); + pr_err("%s() failed to map (1 extent)\n", __func__); return 0; } @@ -234,13 +231,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } - printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block); + pr_err("%s() failed to map block %u (dir)\n", __func__, block); return 0; } -#ifdef DEBUG - printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block); -#endif + pr_debug("%s(): indirect search for logical block %u\n", + __func__, block); direxts = in->extents[0].cooked.ex_offset; indexts = in->numextents; @@ -262,7 +258,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { if (dirext == direxts) { /* should never happen */ - printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); + pr_err("couldn't find direct extent for indirect extent %d (block %u)\n", + cur, block); if (bh) brelse(bh); return 0; } @@ -279,12 +276,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { bh = sb_bread(inode->i_sb, iblock); if (!bh) { - printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock); + pr_err("%s() failed at block %d\n", + __func__, iblock); return 0; } -#ifdef DEBUG - printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock); -#endif + pr_debug("%s(): read indirect extent block %d\n", + __func__, iblock); first = 0; lastblock = iblock; } @@ -294,7 +291,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { extent_copy(&(exts[ioffset]), &ext); if (ext.cooked.ex_magic != 0) { - printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock); + pr_err("extent %d has bad magic number in block %d\n", + cur, iblock); if (bh) brelse(bh); return 0; } @@ -306,7 +304,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } if (bh) brelse(bh); - printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block); + pr_err("%s() failed to map block %u (indir)\n", __func__, block); return 0; } diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 96f66d213a19..356c044e2cd3 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -23,20 +23,22 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) efs_block_t block; if (inode->i_size & (EFS_DIRBSIZE-1)) - printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); for(block = 0; block < inode->i_blocks; block++) { bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block); + pr_err("%s(): failed to read dir block %d\n", + __func__, block); return 0; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - printk(KERN_ERR "EFS: find_entry(): invalid directory block\n"); + pr_err("%s(): invalid directory block\n", __func__); brelse(bh); return(0); } diff --git a/fs/efs/super.c b/fs/efs/super.c index 3befcc9f5d63..7fca462ea4e3 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -134,7 +134,7 @@ static const struct export_operations efs_export_ops = { static int __init init_efs_fs(void) { int err; - printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); + pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); err = init_inodecache(); if (err) goto out1; @@ -179,12 +179,12 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { csum += be32_to_cpu(cs); } if (csum) { - printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n"); + pr_warn("SGI disklabel: checksum bad, label corrupted\n"); return 0; } #ifdef DEBUG - printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile); + pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); for(i = 0; i < NVDIR; i++) { int j; @@ -196,9 +196,8 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { name[j] = (char) 0; if (name[0]) { - printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n", - name, - (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), + pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", + name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); } } @@ -211,12 +210,11 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } #ifdef DEBUG if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { - printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", - i, - (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn), - (int) be32_to_cpu(vh->vh_pt[i].pt_nblks), - pt_type, - (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); + pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", + i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), + (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), + pt_type, (pt_entry->pt_name) ? + pt_entry->pt_name : "unknown"); } #endif if (IS_EFS(pt_type)) { @@ -226,11 +224,10 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } if (slice == -1) { - printk(KERN_NOTICE "EFS: partition table contained no EFS partitions\n"); + pr_notice("partition table contained no EFS partitions\n"); #ifdef DEBUG } else { - printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n", - slice, + pr_info("using slice %d (type %s, offset 0x%x)\n", slice, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", sblock); #endif @@ -268,7 +265,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { - printk(KERN_ERR "EFS: device does not support %d byte blocks\n", + pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } @@ -277,7 +274,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, 0); if (!bh) { - printk(KERN_ERR "EFS: cannot read volume header\n"); + pr_err("cannot read volume header\n"); return -EINVAL; } @@ -295,13 +292,14 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { - printk(KERN_ERR "EFS: cannot read superblock\n"); + pr_err("cannot read superblock\n"); return -EINVAL; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG - printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); + pr_warn("invalid superblock at block %u\n", + sb->fs_start + EFS_SUPER); #endif brelse(bh); return -EINVAL; @@ -310,7 +308,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!(s->s_flags & MS_RDONLY)) { #ifdef DEBUG - printk(KERN_INFO "EFS: forcing read-only mode\n"); + pr_info("forcing read-only mode\n"); #endif s->s_flags |= MS_RDONLY; } @@ -318,13 +316,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_export_op = &efs_export_ops; root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { - printk(KERN_ERR "EFS: get root inode failed\n"); + pr_err("get root inode failed\n"); return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { - printk(KERN_ERR "EFS: get root dentry failed\n"); + pr_err("get root dentry failed\n"); return -ENOMEM; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index af903128891c..b10b48c2a7af 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -293,7 +293,7 @@ static LIST_HEAD(tfile_check_list); static long zero; static long long_max = LONG_MAX; -ctl_table epoll_table[] = { +struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -910,7 +910,7 @@ static const struct file_operations eventpoll_fops = { void eventpoll_release_file(struct file *file) { struct eventpoll *ep; - struct epitem *epi; + struct epitem *epi, *next; /* * We don't want to get "file->f_lock" because it is not @@ -926,7 +926,7 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. */ mutex_lock(&epmutex); - list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { + list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); diff --git a/fs/exec.c b/fs/exec.c index 476f3ebf437e..a3d33fe592d6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP - /* Limit stack size to 1GB */ + /* Limit stack size */ stack_base = rlimit_max(RLIMIT_STACK); - if (stack_base > (1 << 30)) - stack_base = 1 << 30; + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) @@ -1046,13 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, const char *buf) +void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_event_comm(tsk); + perf_event_comm(tsk, exec); } int flush_old_exec(struct linux_binprm * bprm) @@ -1110,7 +1110,8 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - set_task_comm(current, kbasename(bprm->filename)); + perf_event_exec(); + __set_task_comm(current, kbasename(bprm->filename), true); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore index 1ca7fb7b6ba8..2daf2329c28d 100644 --- a/fs/exofs/Kconfig.ore +++ b/fs/exofs/Kconfig.ore @@ -9,4 +9,6 @@ config ORE tristate depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR + select RAID6_PQ + select ASYNC_PQ default SCSI_OSD_ULD diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 491c6c078e7f..71bf8e4fb5d4 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id) const struct file_operations exofs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .release = exofs_release_file, .fsync = exofs_file_fsync, .flush = exofs_flush, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations exofs_file_inode_operations = { diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d1c244d67667..3f9cafd73931 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -964,7 +964,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, /* TODO: Should be easy enough to do proprly */ static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { return 0; } diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index dae884694bd9..cfc0205d62c4 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->parity = 1; break; case PNFS_OSD_RAID_PQ: + layout->parity = 2; + break; case PNFS_OSD_RAID_4: default: - ORE_ERR("Only RAID_0/5 for now\n"); + ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", + layout->raid_algorithm); return -EINVAL; } if (0 != (layout->stripe_unit & ~PAGE_MASK)) { @@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length /= stripe_length; layout->max_io_length *= stripe_length; } + ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); + return 0; } EXPORT_SYMBOL(ore_verify_layout); @@ -545,21 +550,24 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + u32 first_dev = C - C % group_width; div_u64_rem(file_offset, stripe_unit, &si->unit_off); si->obj_offset = si->unit_off + (N * stripe_unit) + (M * group_depth * stripe_unit); + si->cur_comp = C - first_dev; + si->cur_pg = si->unit_off / PAGE_SIZE; if (parity) { u32 LCMdP = lcm(group_width, parity) / parity; /* R = N % LCMdP; */ u32 RxP = (N % LCMdP) * parity; - u32 first_dev = C - C % group_width; si->par_dev = (group_width + group_width - parity - RxP) % group_width + first_dev; - si->dev = (group_width + C - RxP) % group_width + first_dev; + si->dev = (group_width + group_width + C - RxP) % + group_width + first_dev; si->bytes_in_stripe = U; si->first_stripe_start = M * S + G * T + N * U; } else { @@ -649,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance return ret; } +static int _add_parity_units(struct ore_io_state *ios, + struct ore_striping_info *si, + unsigned dev, unsigned first_dev, + unsigned mirrors_p1, unsigned devs_in_group, + unsigned cur_len) +{ + unsigned do_parity; + int ret = 0; + + for (do_parity = ios->layout->parity; do_parity; --do_parity) { + struct ore_per_dev_state *per_dev; + + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length && !per_dev->offset) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, + do_parity == 1); + if (unlikely(ret)) + break; + + if (do_parity != 1) { + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % + ios->layout->group_width; + } + } + + return ret; +} + static int _prepare_for_striping(struct ore_io_state *ios) { struct ore_striping_info *si = &ios->si; @@ -658,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios) unsigned devs_in_group = group_width * mirrors_p1; unsigned dev = si->dev; unsigned first_dev = dev - (dev % devs_in_group); - unsigned dev_order; unsigned cur_pg = ios->pages_consumed; u64 length = ios->length; int ret = 0; @@ -670,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios) BUG_ON(length > si->length); - dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); - si->cur_comp = dev_order; - si->cur_pg = si->unit_off / PAGE_SIZE; - while (length) { - unsigned comp = dev - first_dev; - struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; + struct ore_per_dev_state *per_dev = + &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; - if (!per_dev->length) { + if (!per_dev->length && !per_dev->offset) { + /* First time initialize the per_dev info. */ per_dev->dev = dev; if (dev == si->dev) { WARN_ON(dev == si->par_dev); @@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios) page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && (page_off != ios->pgbase)); } else { - if (si->cur_comp > dev_order) - per_dev->offset = - si->obj_offset - si->unit_off; - else /* si->cur_comp < dev_order */ - per_dev->offset = - si->obj_offset + stripe_unit - - si->unit_off; + per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; } } else { @@ -708,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios) if (unlikely(ret)) goto out; - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - length -= cur_len; + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; si->cur_comp = (si->cur_comp + 1) % group_width; if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { if (!length && ios->sp2d) { @@ -720,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios) * stripe. then operate on parity dev. */ dev = si->par_dev; - } - if (ios->sp2d) - /* In writes cur_len just means if it's the - * last one. See _ore_add_parity_unit. - */ - cur_len = length; - per_dev = &ios->per_dev[dev - first_dev]; - if (!per_dev->length) { - /* Only/always the parity unit of the first - * stripe will be empty. So this is a chance to - * initialize the per_dev info. - */ - per_dev->dev = dev; - per_dev->offset = si->obj_offset - si->unit_off; + /* If last stripe operate on parity comp */ + si->cur_comp = group_width - ios->layout->parity; } - ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + ret = _add_parity_units(ios, si, dev, first_dev, + mirrors_p1, devs_in_group, + ios->sp2d ? length : cur_len); if (unlikely(ret)) goto out; @@ -747,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios) /* Next stripe, start fresh */ si->cur_comp = 0; si->cur_pg = 0; + si->obj_offset += cur_len; + si->unit_off = 0; } } out: diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 4e2c032ab8a1..7f20f25c232c 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) { unsigned p; + unsigned tx_flags = ASYNC_TX_ACK; + + if (sp2d->parity == 1) + tx_flags |= ASYNC_TX_XOR_ZERO_DST; + for (p = 0; p < sp2d->pages_in_unit; p++) { struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; if (!_1ps->write_count) continue; - init_async_submit(&_1ps->submit, - ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, - NULL, - NULL, NULL, - (addr_conv_t *)_1ps->scribble); - - /* TODO: raid6 */ - _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, - 0, sp2d->data_devs, PAGE_SIZE, - &_1ps->submit); + init_async_submit(&_1ps->submit, tx_flags, + NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); + + if (sp2d->parity == 1) + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], + _1ps->pages, 0, sp2d->data_devs, + PAGE_SIZE, &_1ps->submit); + else /* parity == 2 */ + _1ps->tx = async_gen_syndrome(_1ps->pages, 0, + sp2d->data_devs + sp2d->parity, + PAGE_SIZE, &_1ps->submit); } for (p = 0; p < sp2d->pages_in_unit; p++) { @@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) ore_calc_stripe_info(ios->layout, *offset, 0, &si); - p = si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, si.par_dev, si.dev); + p = si.cur_pg; + c = si.cur_comp; page = ios->sp2d->_1p_stripes[p].pages[c]; pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); @@ -534,9 +539,8 @@ static int _read_4_write_last_stripe(struct ore_io_state *ios) goto read_it; ore_calc_stripe_info(ios->layout, offset, 0, &read_si); - p = read_si.unit_off / PAGE_SIZE; - c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, - ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); + p = read_si.cur_pg; + c = read_si.cur_comp; if (min_p == sp2d->pages_in_unit) { /* Didn't do it yet */ @@ -620,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios) int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, struct ore_per_dev_state *per_dev, - unsigned cur_len) + unsigned cur_len, bool do_xor) { if (ios->reading) { if (per_dev->cur_sg >= ios->sgs_per_dev) { @@ -640,17 +644,16 @@ int _ore_add_parity_unit(struct ore_io_state *ios, si->cur_pg = _sp2d_min_pg(sp2d); num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; - if (!cur_len) /* If last stripe operate on parity comp */ - si->cur_comp = sp2d->data_devs; - if (!per_dev->length) { per_dev->offset += si->cur_pg * PAGE_SIZE; /* If first stripe, Read in all read4write pages * (if needed) before we calculate the first parity. */ - _read_4_write_first_stripe(ios); + if (do_xor) + _read_4_write_first_stripe(ios); } - if (!cur_len) /* If last stripe r4w pages of last stripe */ + if (!cur_len && do_xor) + /* If last stripe r4w pages of last stripe */ _read_4_write_last_stripe(ios); _read_4_write_execute(ios); @@ -662,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios, ++(ios->cur_par_page); } - BUG_ON(si->cur_comp != sp2d->data_devs); + BUG_ON(si->cur_comp < sp2d->data_devs); BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, @@ -670,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios, if (unlikely(ret)) return ret; - /* TODO: raid6 if (last_parity_dev) */ - _gen_xor_unit(sp2d); - _sp2d_reset(sp2d, ios->r4w, ios->private); + if (do_xor) { + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } } return 0; } diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index 2ffd2c3c6e46..cf6375d82129 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -31,24 +31,6 @@ #define ORE_DBGMSG2(M...) do {} while (0) /* #define ORE_DBGMSG2 ORE_DBGMSG */ -/* Calculate the component order in a stripe. eg the logical data unit - * address within the stripe of @dev given the @par_dev of this stripe. - */ -static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, - unsigned par_dev, unsigned dev) -{ - unsigned first_dev = dev - dev % devs_in_group; - - dev -= first_dev; - par_dev -= first_dev; - - if (devs_in_group == par_dev) /* The raid 0 case */ - return dev / mirrors_p1; - /* raid4/5/6 case */ - return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / - mirrors_p1; -} - /* ios_raid.c stuff needed by ios.c */ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); void _ore_free_raid_stuff(struct ore_io_state *ios); @@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios); void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, bool not_last); int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, - struct ore_per_dev_state *per_dev, unsigned cur_len); + struct ore_per_dev_state *per_dev, unsigned cur_len, + bool do_xor); void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, struct ore_striping_info *si, struct page *page); static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 48a359dd286e..b01fbfb51f43 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -259,7 +259,7 @@ static int filldir_one(void * __buf, const char * name, int len, /** * get_name - default export_operations->get_name function - * @dentry: the directory in which to find a name + * @path: the directory in which to find a name * @name: a pointer to a %NAME_MAX+1 char buffer to store the name * @child: the dentry for the child directory. * @@ -337,7 +337,7 @@ out: /** * export_encode_fh - default export_operations->encode_fh function * @inode: the object to encode - * @fh: where to store the file handle fragment + * @fid: where to store the file handle fragment * @max_len: maximum length to store there * @parent: parent directory inode, if wanted * diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 44c36e590765..7c87b22a7228 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -62,10 +62,10 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, @@ -75,7 +75,7 @@ const struct file_operations ext2_file_operations = { .release = ext2_release_file, .fsync = ext2_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; #ifdef CONFIG_EXT2_FS_XIP diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b1d2a4675d42..36d35c36311d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -850,18 +850,18 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) } static ssize_t -ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - ext2_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); if (ret < 0 && (rw & WRITE)) - ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); + ext2_write_failed(mapping, offset + count); return ret; } diff --git a/fs/ext3/file.c b/fs/ext3/file.c index aad05311392a..a062fa1e1b11 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -50,10 +50,10 @@ static int ext3_release_file (struct inode * inode, struct file * filp) const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, @@ -63,7 +63,7 @@ const struct file_operations ext3_file_operations = { .release = ext3_release_file, .fsync = ext3_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations ext3_file_inode_operations = { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f5157d0d1b43..2c6ccc49ba27 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1716,17 +1716,17 @@ static int ext3_journalled_writepage(struct page *page, WARN_ON_ONCE(IS_RDONLY(inode) && !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)); - if (ext3_journal_current_handle()) - goto no_write; - trace_ext3_journalled_writepage(page); - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } - if (!page_has_buffers(page) || PageChecked(page)) { + if (ext3_journal_current_handle()) + goto no_write; + + handle = ext3_journal_start(inode, + ext3_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto no_write; + } /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. @@ -1749,17 +1749,18 @@ static int ext3_journalled_writepage(struct page *page, atomic_set(&EXT3_I(inode)->i_datasync_tid, handle->h_transaction->t_tid); unlock_page(page); + err = ext3_journal_stop(handle); + if (!ret) + ret = err; } else { /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. + * It is a page full of checkpoint-mode buffers. Go and write + * them. They should have been already mapped when they went + * to the journal so provide NULL get_block function to catch + * errors. */ - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, NULL, wbc); } - err = ext3_journal_stop(handle); - if (!ret) - ret = err; out: return ret; @@ -1820,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait) * VFS code falls back into buffered path in that case so we are safe. */ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1829,10 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int retries = 0; - trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext3_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE) { loff_t final_size = offset + count; @@ -1856,15 +1856,14 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - ext3_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) ext3_truncate_failed_direct_write(inode); @@ -1909,8 +1908,7 @@ retry: ret = err; } out: - trace_ext3_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + trace_ext3_direct_IO_exit(inode, offset, count, rw, ret); return ret; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6ea7b1436bbc..fca382037ddd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -83,9 +83,9 @@ static inline int ext4_block_in_group(struct super_block *sb, /* Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ -unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned num_clusters; int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; @@ -176,9 +176,10 @@ static unsigned int num_clusters_in_group(struct super_block *sb, } /* Initializes an uninitialized block bitmap */ -void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -193,7 +194,16 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -307,6 +317,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_fsblk_t blk; @@ -326,14 +337,14 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; @@ -341,21 +352,23 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - offset + EXT4_SB(sb)->s_itb_per_group, - offset); - if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group) + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group), + EXT4_B2C(sbi, offset)); + if (next_zero_bit < + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group)) /* bad bitmap for inode tables */ return blk; return 0; } -void ext4_validate_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - ext4_group_t block_group, - struct buffer_head *bh) +static void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return; @@ -366,6 +379,9 @@ void ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -373,6 +389,9 @@ void ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); return; } @@ -667,7 +686,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; @@ -708,16 +727,6 @@ static inline int test_root(ext4_group_t a, int b) } } -static int ext4_group_sparse(ext4_group_t group) -{ - if (group <= 1) - return 1; - if (!(group & 1)) - return 0; - return (test_root(group, 7) || test_root(group, 5) || - test_root(group, 3)); -} - /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem @@ -728,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group) */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && - !ext4_group_sparse(group)) + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (group == 0) + return 1; + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { + if (group == le32_to_cpu(es->s_backup_bgs[0]) || + group == le32_to_cpu(es->s_backup_bgs[1])) + return 1; + return 0; + } + if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) + return 1; + if (!(group & 1)) return 0; - return 1; + if (test_root(group, 3) || (test_root(group, 5)) || + test_root(group, 7)) + return 1; + + return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d638c57e996e..ef1bed66c14f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -105,7 +105,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; - int i, stored; + int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); @@ -133,7 +133,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return ret; } - stored = 0; offset = ctx->pos & (sb->s_blocksize - 1); while (ctx->pos < inode->i_size) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f1c65dc7cc0a..7cc5a0e23688 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -158,7 +158,6 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED (1 << BH_Mapped) #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) -#define EXT4_MAP_UNINIT (1 << BH_Uninit) /* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of * ext4_map_blocks wants to know whether or not the underlying cluster has * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that @@ -169,7 +168,7 @@ struct ext4_allocation_request { #define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) + EXT4_MAP_FROM_CLUSTER) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -184,7 +183,7 @@ struct ext4_map_blocks { #define EXT4_IO_END_UNWRITTEN 0x0001 /* - * For converting uninitialized extents on a work queue. 'handle' is used for + * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. */ typedef struct ext4_io_end { @@ -537,26 +536,26 @@ enum { /* * Flags used by ext4_map_blocks() */ - /* Allocate any needed blocks and/or convert an unitialized + /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 - /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 -#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an - unitialized extents if not allocated, split the uninitialized + unwritten extents if not allocated, split the unwritten extent if blocks has been preallocated already*/ #define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -876,6 +875,8 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + /* * File creation time. Its function is same as that of * struct timespec i_{a,c,m}time in the generic inode. @@ -1159,7 +1160,8 @@ struct ext4_super_block { __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ - __le32 s_reserved[108]; /* Padding to the end of the block */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __le32 s_reserved[106]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1505,6 +1507,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -1953,10 +1956,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb, extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); -extern void ext4_validate_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - ext4_group_t block_group, - struct buffer_head *bh); extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, @@ -1985,16 +1984,9 @@ extern int ext4_wait_block_bitmap(struct super_block *sb, struct buffer_head *bh); extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); -extern void ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); -extern unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); /* dir.c */ @@ -2137,8 +2129,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -2150,8 +2140,7 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs); + struct iov_iter *iter, loff_t offset); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); @@ -2198,8 +2187,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ extern int ext4_calculate_overhead(struct super_block *sb); -extern int ext4_superblock_csum_verify(struct super_block *sb, - struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); @@ -2466,23 +2453,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; @@ -2588,19 +2558,11 @@ extern const struct file_operations ext4_dir_operations; extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); -extern void ext4_unwritten_wait(struct inode *inode); /* inline.c */ extern int ext4_has_inline_data(struct inode *inode); -extern int ext4_get_inline_size(struct inode *inode); extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern void ext4_write_inline_data(struct inode *inode, - struct ext4_iloc *iloc, - void *buffer, loff_t pos, - unsigned int len); -extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); @@ -2788,23 +2750,20 @@ extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc); + struct writeback_control *wbc, + bool keep_towrite); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); -extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); -extern int ext4_mmp_csum_verify(struct super_block *sb, - struct mmp_struct *mmp); /* * Note that these flags will never ever appear in a buffer_head's state flag. * See EXT4_MAP_... to see where this is used. */ enum ext4_state_bits { - BH_Uninit /* blocks are allocated but uninitialized on disk */ - = BH_JBDPrivateStart, - BH_AllocFromCluster, /* allocated blocks were part of already + BH_AllocFromCluster /* allocated blocks were part of already * allocated cluster. */ + = BH_JBDPrivateStart }; /* diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 5074fe23f19e..a867f5ca9991 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -137,21 +137,21 @@ struct ext4_ext_path { * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this - * particular extent is an initialized extent or an uninitialized (i.e. + * particular extent is an initialized extent or an unwritten (i.e. * preallocated). - * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an - * uninitialized extent. + * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an + * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an - * uninitialized one. In other words, if MSB of ee_len is set, it is an - * uninitialized extent with only one special scenario when ee_len = 0x8000. - * In this case we can not have an uninitialized extent of zero length and + * unwritten one. In other words, if MSB of ee_len is set, it is an + * unwritten extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* - * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) -#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) +#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ @@ -187,14 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { - /* We can not have an uninitialized extent of zero length! */ + /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } -static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index c3fb607413ed..0074e0d23d6e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -122,9 +122,10 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return handle; } -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, struct buffer_head *bh, - handle_t *handle, int err) +static void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 81cfefa9dc0c..17c00ff202f2 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -231,10 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* * Wrapper functions with which ext4 calls into JBD. */ -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err); - int __ext4_journal_get_write_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 82df3ce9874a..4da228a0e6d0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,8 +50,8 @@ */ #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ #define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ @@ -143,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, { if (path->p_bh) { /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); return ext4_journal_get_write_access(handle, path->p_bh); } /* path points to leaf/index in inode body */ @@ -524,7 +525,7 @@ __read_extent_tree_block(const char *function, unsigned int line, lblk - prev, ~0, EXTENT_STATUS_HOLE); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); @@ -620,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else @@ -646,7 +647,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); @@ -677,7 +678,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; @@ -802,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode, ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), - ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -1686,11 +1687,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, /* * Make sure that both extents are initialized. We don't merge - * uninitialized extents so that we can be sure that end_io code has + * unwritten extents so that we can be sure that end_io code has * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); @@ -1707,10 +1708,10 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; - if (ext4_ext_is_uninitialized(ex1) && + if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || - (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN))) + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1735,7 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0, uninit; + int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1745,11 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ - uninit = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); - if (uninit) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1903,7 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - int mb_flags = 0, uninit; + int mb_flags = 0, unwritten; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1943,21 +1944,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug("append [%d]%d block to %u:[%d]%d" "(from %llu)\n", - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; - uninit = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninit) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1969,10 +1970,10 @@ prepend: ext_debug("prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), + ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, @@ -1980,13 +1981,13 @@ prepend: if (err) return err; - uninit = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninit) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -2046,7 +2047,7 @@ has_space: ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { @@ -2057,7 +2058,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; @@ -2068,7 +2069,7 @@ has_space: "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } @@ -2078,7 +2079,7 @@ has_space: "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, @@ -2200,7 +2201,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, es.es_lblk = le32_to_cpu(ex->ee_block); es.es_len = ext4_ext_get_actual_len(ex); es.es_pblk = ext4_ext_pblock(ex); - if (ext4_ext_is_uninitialized(ex)) + if (ext4_ext_is_unwritten(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; } @@ -2576,7 +2577,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; - unsigned uninitialized = 0; + unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; @@ -2623,13 +2624,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; else - uninitialized = 0; + unwritten = 0; ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, - uninitialized, ex_ee_len); + unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2701,11 +2702,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex->ee_len = cpu_to_le16(num); /* - * Do not mark uninitialized if all the blocks in the + * Do not mark unwritten if all the blocks in the * extent have been removed. */ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); + if (unwritten && num) + ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf @@ -2854,9 +2855,9 @@ again: end < ee_block + ext4_ext_get_actual_len(ex) - 1) { int split_flag = 0; - if (ext4_ext_is_uninitialized(ex)) - split_flag = EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (ext4_ext_is_unwritten(ex)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; /* * Split the extent in two so that 'end' is the last @@ -3113,7 +3114,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * @path: the path to the extent * @split: the logical block where the extent is splitted. * @split_flags: indicates if the extent could be zeroout if split fails, and - * the states(init or uninit) of new extents. + * the states(init or unwritten) of new extents. * @flags: flags used to insert new extent to extent tree. * * @@ -3155,10 +3156,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - BUG_ON(!ext4_ext_is_uninitialized(ex) && + BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2)); + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3170,8 +3171,8 @@ static int ext4_split_extent_at(handle_t *handle, * then we just change the state of the extent, and splitting * is not needed. */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); @@ -3185,8 +3186,8 @@ static int ext4_split_extent_at(handle_t *handle, /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more @@ -3200,8 +3201,8 @@ static int ext4_split_extent_at(handle_t *handle, ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { @@ -3278,7 +3279,7 @@ static int ext4_split_extent(handle_t *handle, struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; - int uninitialized; + int unwritten; int split_flag1, flags1; int allocated = map->m_len; @@ -3286,14 +3287,14 @@ static int ext4_split_extent(handle_t *handle, ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); + unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; + if (unwritten) + split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, path, @@ -3313,15 +3314,20 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; - uninitialized = ext4_ext_is_uninitialized(ex); + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; - if (uninitialized) { - split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (unwritten) { + split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_MARK_UNINIT2); + EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); @@ -3336,16 +3342,16 @@ out: /* * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized + * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two - * uninitialized). + * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * Pre-conditions: - * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * @@ -3391,12 +3397,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ - BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its neighbor. This is much cheaper + * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) @@ -3432,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3447,7 +3453,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); @@ -3478,7 +3484,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ - if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ @@ -3493,7 +3499,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); @@ -3598,26 +3604,26 @@ out: /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO to write - * to an uninitialized extent. + * to an unwritten extent. * - * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple initialized/uninitialized extents (up to three) + * Writing to an unwritten extent may result in splitting the unwritten + * extent into multiple initialized/unwritten extents (up to three) * There are three possibilities: - * a> There is no split required: Entire extent should be uninitialized + * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One of more index blocks maybe needed if the extent tree grow after - * the uninitialized extent split. To prevent ENOSPC occur at the IO - * complete, we need to split the uninitialized extent before DIO submit - * the IO. The uninitialized extent called at this time will be split - * into three uninitialized extent(at most). After IO complete, the part + * the unwritten extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the unwritten extent before DIO submit + * the IO. The unwritten extent called at this time will be split + * into three unwritten extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of uninitialized extent to be written on success. + * Returns the size of unwritten extent to be written on success. */ static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, @@ -3655,7 +3661,7 @@ static int ext4_split_convert_extents(handle_t *handle, } else if (flags & EXT4_GET_BLOCKS_CONVERT) { split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2); + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); @@ -3694,13 +3700,19 @@ static int ext4_convert_initialized_extents(handle_t *handle, } depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - /* first mark the extent as uninitialized */ - ext4_ext_mark_uninitialized(ex); + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed @@ -3960,10 +3972,10 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, /* * Make sure that the extent is no bigger than we support with - * uninitialized extent + * unwritten extent */ - if (map->m_len > EXT_UNINIT_MAX_LEN) - map->m_len = EXT_UNINIT_MAX_LEN / 2; + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; ret = ext4_convert_initialized_extents(handle, inode, map, path); @@ -3982,7 +3994,7 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, } static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, unsigned int allocated, ext4_fsblk_t newblock) @@ -3991,23 +4003,23 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* - * When writing into uninitialized space, we should not fail to + * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; - trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submit the IO, split the extent */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { + if (flags & EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_convert_extents(handle, inode, map, path, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) @@ -4022,12 +4034,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, else ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; goto out; } /* IO end_io complete, convert the filled extent to written */ - if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, path); if (ret >= 0) { @@ -4048,7 +4058,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, * repeat fallocate creation request * we already have an unwritten extent */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } @@ -4299,7 +4309,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* - * Uninitialized extents are treated as holes, except that + * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); @@ -4318,16 +4328,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * If the extent is initialized check whether the * caller wants to convert it to unwritten. */ - if ((!ext4_ext_is_uninitialized(ex)) && + if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = ext4_ext_convert_initialized_extent( handle, inode, map, path, flags, allocated, newblock); goto out2; - } else if (!ext4_ext_is_uninitialized(ex)) + } else if (!ext4_ext_is_unwritten(ex)) goto out; - ret = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_unwritten_extents( handle, inode, map, path, flags, allocated, newblock); if (ret < 0) @@ -4399,15 +4409,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && - !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; - else if (map->m_len > EXT_UNINIT_MAX_LEN && - (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_UNINIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); @@ -4455,21 +4465,19 @@ got_allocated_blocks: /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); - /* Mark uninitialized */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ - ext4_ext_mark_uninitialized(&newex); + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; /* * io_end structure was created for every IO write to an - * uninitialized extent. To avoid unnecessary conversion, + * unwritten extent. To avoid unnecessary conversion, * here we flag the IO that really needs the conversion. * For non asycn direct IO case, flag the inode state * that we need to perform conversion when IO is done. */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) + if (flags & EXT4_GET_BLOCKS_PRE_IO) set_unwritten = 1; - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; } err = 0; @@ -4596,9 +4604,9 @@ got_allocated_blocks: /* * Cache the extent and update transaction to commit on fdatasync only - * when it is _not_ an uninitialized extent. + * when it is _not_ an unwritten extent. */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); @@ -4672,7 +4680,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * that it doesn't get unnecessarily split into multiple * extents. */ - if (len <= EXT_UNINIT_MAX_LEN) + if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* @@ -4730,6 +4738,16 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + /* * Write out all dirty pages to avoid race conditions * Then release them. @@ -4761,7 +4779,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -4878,9 +4896,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); - if (mode & FALLOC_FL_COLLAPSE_RANGE) - return ext4_collapse_range(inode, offset, len); - ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4892,6 +4907,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); @@ -4904,7 +4922,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - lblk; - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -5229,18 +5247,19 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = ex_last->ee_block + + *start = le32_to_cpu(ex_last->ee_block) + ext4_ext_get_actual_len(ex_last); while (ex_start <= ex_last) { - ex_start->ee_block -= shift; - if (ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) { - if (ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) - ex_last--; - } - ex_start++; + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -5255,7 +5274,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - path[depth].p_idx->ei_block -= shift; + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5300,7 +5319,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return ret; } - stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5313,10 +5333,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * enough to accomodate the shift. */ path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = extent->ee_block; - ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } ext4_ext_drop_refs(path); kfree(path); @@ -5331,7 +5359,13 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - current_block = extent->ee_block; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ ret = mext_next_extent(inode, path, &extent); @@ -5365,17 +5399,18 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; - loff_t new_size; + loff_t new_size, ioffset; int ret; - BUG_ON(offset + len > i_size_read(inode)); - /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || len & (EXT4_BLOCK_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) return -EOPNOTSUPP; trace_ext4_collapse_range(inode, offset, len); @@ -5383,22 +5418,34 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); if (ret) return ret; /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; goto out_mutex; } @@ -5408,7 +5455,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache_range(inode, offset, -1); + truncate_pagecache(inode, ioffset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); @@ -5425,7 +5472,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start - 1); + EXT_MAX_BLOCKS - punch_start); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; @@ -5436,6 +5483,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start); @@ -5445,10 +5493,9 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } new_size = i_size_read(inode) - len; - truncate_setsize(inode, new_size); + i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a014a7194b2..0b7e28e7eaa4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -344,8 +344,14 @@ static int ext4_es_can_be_merged(struct extent_status *es1, if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) + if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { + pr_warn("ES assertion failed when merging extents. " + "The sum of lengths of es1 (%d) and es2 (%d) " + "is bigger than allowed file size (%d)\n", + es1->es_len, es2->es_len, EXT_MAX_BLOCKS); + WARN_ON(1); return 0; + } if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; @@ -433,7 +439,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, ee_start = ext4_ext_pblock(ex); ee_len = ext4_ext_get_actual_len(ex); - ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; es_status = ext4_es_is_unwritten(es) ? 1 : 0; /* @@ -810,7 +816,7 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; - block = 0x7FDEADBEEF; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + @@ -960,10 +966,10 @@ retry: continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei) + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) continue; - write_lock(&ei->i_es_lock); shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca7502d89fde..8695f70af1ef 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -57,7 +57,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -void ext4_unwritten_wait(struct inode *inode) +static void ext4_unwritten_wait(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); @@ -74,79 +74,105 @@ void ext4_unwritten_wait(struct inode *inode) * or one thread will zero the other's data, causing corruption. */ static int -ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - size_t count = iov_length(iov, nr_segs); - loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; - if ((pos & blockmask) || (final_size & blockmask)) + if ((pos | iov_iter_alignment(from)) & blockmask) return 1; return 0; } static ssize_t -ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = file_inode(iocb->ki_filp); + struct mutex *aio_mutex = NULL; struct blk_plug plug; - int unaligned_aio = 0; - ssize_t ret; + int o_direct = file->f_flags & O_DIRECT; int overwrite = 0; - size_t length = iov_length(iov, nr_segs); - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb)) - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); + size_t length = iov_iter_count(from); + ssize_t ret; + loff_t pos = iocb->ki_pos; - /* Unaligned direct AIO must be serialized; see comment above */ - if (unaligned_aio) { - mutex_lock(ext4_aio_mutex(inode)); + /* + * Unaligned direct AIO must be serialized; see comment above + * In the case of O_APPEND, assume that we must always serialize + */ + if (o_direct && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && + (file->f_flags & O_APPEND || + ext4_unaligned_aio(inode, from, pos))) { + aio_mutex = ext4_aio_mutex(inode); + mutex_lock(aio_mutex); ext4_unwritten_wait(inode); } - BUG_ON(iocb->ki_pos != pos); - mutex_lock(&inode->i_mutex); - blk_start_plug(&plug); + if (file->f_flags & O_APPEND) + iocb->ki_pos = pos = i_size_read(inode); + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - iocb->private = &overwrite; + if ((pos > sbi->s_bitmap_maxbytes) || + (pos == sbi->s_bitmap_maxbytes && length > 0)) { + mutex_unlock(&inode->i_mutex); + ret = -EFBIG; + goto errout; + } - /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; - int err, len; + if (pos + length > sbi->s_bitmap_maxbytes) + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); + } - map.m_lblk = pos >> blkbits; - map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) - - map.m_lblk; - len = map.m_len; + if (o_direct) { + blk_start_plug(&plug); - err = ext4_map_blocks(NULL, inode, &map, 0); - /* - * 'err==len' means that all of blocks has been preallocated no - * matter they are initialized or not. For excluding - * uninitialized extents, we need to check m_flags. There are - * two conditions that indicate for initialized extents. - * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned; - * 2) If we do a real lookup, non-flags are returned. - * So we should check these two conditions. - */ - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) - overwrite = 1; + iocb->private = &overwrite; + + /* check whether we do a DIO overwrite or not */ + if (ext4_should_dioread_nolock(inode) && !aio_mutex && + !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, len; + + map.m_lblk = pos >> blkbits; + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) + - map.m_lblk; + len = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of blocks has + * been preallocated no matter they are + * initialized or not. For excluding + * unwritten extents, we need to check + * m_flags. There are two conditions that + * indicate for initialized extents. 1) If we + * hit extent cache, EXT4_MAP_MAPPED flag is + * returned; 2) If we do a real lookup, + * non-flags are returned. So we should check + * these two conditions. + */ + if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) + overwrite = 1; + } } - ret = __generic_file_aio_write(iocb, iov, nr_segs); + ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (ret > 0) { @@ -156,45 +182,12 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0) ret = err; } - blk_finish_plug(&plug); - - if (unaligned_aio) - mutex_unlock(ext4_aio_mutex(inode)); - - return ret; -} - -static ssize_t -ext4_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct inode *inode = file_inode(iocb->ki_filp); - ssize_t ret; - - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. - */ - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - size_t length = iov_length(iov, nr_segs); - - if ((pos > sbi->s_bitmap_maxbytes || - (pos == sbi->s_bitmap_maxbytes && length > 0))) - return -EFBIG; - - if (pos + length > sbi->s_bitmap_maxbytes) { - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, - sbi->s_bitmap_maxbytes - pos); - } - } - - if (unlikely(iocb->ki_filp->f_flags & O_DIRECT)) - ret = ext4_file_dio_write(iocb, iov, nr_segs, pos); - else - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (o_direct) + blk_finish_plug(&plug); +errout: + if (aio_mutex) + mutex_unlock(aio_mutex); return ret; } @@ -244,6 +237,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) { ext4_journal_stop(handle); @@ -593,10 +587,10 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ext4_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, @@ -606,7 +600,7 @@ const struct file_operations ext4_file_operations = { .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, }; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0ee59a6644e2..5b87fc36aab8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, struct ext4_group_desc *gdp) { struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent @@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -185,6 +196,12 @@ verify: ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } @@ -321,6 +338,12 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); } @@ -851,6 +874,13 @@ got: goto out; } + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -887,13 +917,6 @@ got: } } - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) { - ext4_std_error(sb, err); - goto out; - } - /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 594009f5f523..fd69da194826 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return 0; failed: for (; i >= 0; i--) { - if (i != indirect_blks && branch[i].bh) + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) ext4_forget(handle, 1, inode, branch[i].bh, branch[i].bh->b_blocknr); ext4_free_blocks(handle, inode, NULL, new_blocks[i], @@ -639,8 +645,7 @@ out: * VFS code falls back into buffered path in that case so we are safe. */ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -648,7 +653,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, handle_t *handle; ssize_t ret; int orphan = 0; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int retries = 0; if (rw == WRITE) { @@ -687,18 +692,17 @@ retry: goto locked; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, + inode->i_sb->s_bdev, iter, offset, ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) ext4_truncate_failed_write(inode); @@ -1312,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; + ext4_lblk_t count2; + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } - first2 = (first > offset) ? first - offset : 0; + if (first > offset) { + first2 = first - offset; + count2 = count; + } else { + first2 = 0; + count2 = count - (offset - first); + } ret = free_hole_blocks(handle, inode, bh, (__le32 *)bh->b_data, level - 1, - first2, count - offset, + first2, count2, inode->i_sb->s_blocksize >> 2); if (ret) { brelse(bh); @@ -1331,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, if (level == 0 || (bh && all_zeroes((__le32 *)bh->b_data, (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, &blk, &blk+1); - *i_data = 0; + ext4_free_data(handle, inode, parent_bh, + i_data, i_data + 1); } brelse(bh); bh = NULL; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 82edf5b93352..645205d8ada6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -22,7 +22,7 @@ #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 -int ext4_get_inline_size(struct inode *inode) +static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) return EXT4_I(inode)->i_inline_size; @@ -211,8 +211,8 @@ out: * value since it is already handled by ext4_xattr_ibody_inline_set. * That saves us one memcpy. */ -void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, - void *buffer, loff_t pos, unsigned int len) +static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, + void *buffer, loff_t pos, unsigned int len) { struct ext4_xattr_entry *entry; struct ext4_xattr_ibody_header *header; @@ -264,6 +264,7 @@ static int ext4_create_inline_data(handle_t *handle, if (error) return error; + BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; @@ -347,6 +348,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, if (error == -ENODATA) goto out; + BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; @@ -373,8 +375,8 @@ out: return error; } -int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) +static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) { int ret, size; struct ext4_inode_info *ei = EXT4_I(inode); @@ -424,6 +426,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (error) goto out; + BUFFER_TRACE(is.iloc.bh, "get_write_access"); error = ext4_journal_get_write_access(handle, is.iloc.bh); if (error) goto out; @@ -1007,6 +1010,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, if (err) return err; + BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; @@ -1669,6 +1673,7 @@ int ext4_delete_inline_entry(handle_t *handle, EXT4_MIN_INLINE_DATA_SIZE; } + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b0d2c7d5408..8a064734e6eb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -148,6 +148,9 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + if (ext4_has_inline_data(inode)) + return 0; + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } @@ -443,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * could be converted. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -489,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, * the result buffer head is unmapped. If the create ==1, it will make sure * the buffer head is mapped. * @@ -522,6 +525,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); @@ -551,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -618,12 +625,12 @@ found: map->m_flags &= ~EXT4_MAP_FLAGS; /* - * New blocks allocate and/or writing to uninitialized extent + * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() * with create == 1 flag. */ - down_write((&EXT4_I(inode)->i_data_sem)); + down_write(&EXT4_I(inode)->i_data_sem); /* * if the caller is from delayed allocation writeout path @@ -918,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle, */ if (dirty) clear_buffer_dirty(bh); + BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, bh); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -1536,7 +1544,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); goto add_delayed; } @@ -1573,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * Try to see if we can get the block without requesting a new * file system block. */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) { /* * We will soon create blocks for this page, and let @@ -1765,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); if (inline_data) { + BUFFER_TRACE(inode_bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode_bh); err = ext4_handle_dirty_metadata(handle, inode, inode_bh); @@ -1842,6 +1851,7 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; + bool keep_towrite = false; trace_ext4_writepage(page); size = i_size_read(inode); @@ -1872,6 +1882,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } + keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -1888,7 +1899,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, wbc); + ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -1907,7 +1918,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) else len = PAGE_CACHE_SIZE; clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; @@ -2028,7 +2039,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, - * and mark buffers as uninit when we perform writes to uninitialized extents + * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. @@ -2122,12 +2133,12 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; - int err; + int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or - * to convert an uninitialized extent to be initialized (in the case + * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in @@ -2144,7 +2155,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL; - if (ext4_should_dioread_nolock(inode)) + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (map->m_flags & (1 << BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; @@ -2152,7 +2164,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; - if (map->m_flags & EXT4_MAP_UNINIT) { + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; @@ -2243,13 +2255,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", @@ -3056,9 +3078,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as uninitialized + * For holes, we fallocate those blocks, mark them as unwritten * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as uninitialized. + * still keep the range to write as unwritten. * * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we @@ -3071,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * */ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int overwrite = 0; get_block_t *get_block_func = NULL; int dio_flags = 0; @@ -3086,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ext4_ind_direct_IO(rw, iocb, iter, offset); BUG_ON(iocb->private == NULL); @@ -3110,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * We could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as - * uninitialized to prevent parallel buffered read to expose + * unwritten to prevent parallel buffered read to expose * the stale data before DIO complete the data IO. * * As to previously fallocated extents, ext4 get_block will * just simply mark the buffer mapped but still keep the - * extents uninitialized. + * extents unwritten. * * For non AIO case, we will convert those unwritten extents * to written after return back from blockdev_direct_IO. @@ -3153,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, dio_flags = DIO_LOCKING; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, + inode->i_sb->s_bdev, iter, + offset, get_block_func, ext4_end_io_dio, NULL, @@ -3208,11 +3229,11 @@ retake_lock: } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3225,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_has_inline_data(inode)) return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext4_direct_IO_enter(inode, offset, count, rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iter, offset); else - ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - trace_ext4_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + ret = ext4_ind_direct_IO(rw, iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -3426,7 +3446,7 @@ unlock: * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -int ext4_block_truncate_page(handle_t *handle, +static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned offset = from & (PAGE_CACHE_SIZE-1); @@ -3527,15 +3547,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3616,7 +3627,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4300,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; int err = 0, rc, block; - int need_datasync = 0; + int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; - /* For fields not not tracking in the in-memory inode, + spin_lock(&ei->i_raw_lock); + + /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -4343,8 +4356,10 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) + if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + spin_unlock(&ei->i_raw_lock); goto out_brelse; + } raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) @@ -4356,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle, need_datasync = 1; } if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. - */ - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - goto out_brelse; - ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); - } + cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; } raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -4405,12 +4407,24 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); - + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: brelse(bh); @@ -4423,21 +4437,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4449,15 +4462,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a888cac76e9c..2dcb936be90e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; @@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb, if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd; " - "block bitmap corrupt.", + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } mb_set_largest_free_order(sb, grp); @@ -989,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1007,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1044,6 +1048,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. */ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { @@ -1062,7 +1068,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); if (e4b.bd_buddy_page == NULL) { /* @@ -1082,7 +1087,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); err: ext4_mb_put_buddy_page_lock(&e4b); return ret; @@ -1141,7 +1145,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) /* @@ -1168,19 +1172,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); @@ -1197,13 +1206,18 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); @@ -1421,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); @@ -1431,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); /* Mark the block group as corrupt. */ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &e4b->bd_info->bb_state); @@ -2609,7 +2627,7 @@ int ext4_mb_init(struct super_block *sb) sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; - goto out_free_groupinfo_slab; + goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2634,8 +2652,6 @@ int ext4_mb_init(struct super_block *sb) out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; -out_free_groupinfo_slab: - ext4_groupinfo_destroy_slabs(); out: kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; @@ -2868,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!bitmap_bh) goto out_err; + BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; @@ -2880,6 +2897,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); + BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; @@ -3137,7 +3155,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ @@ -5008,6 +5026,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 2ae73a80c19b..ec092437d3e0 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -505,7 +505,7 @@ int ext4_ext_migrate(struct inode *inode) * with i_data_sem held to prevent racing with block * allocation. */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 04434ad3e8e0..32bce844c2e1 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -18,7 +18,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) return cpu_to_le32(csum); } -int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -27,7 +27,7 @@ int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); } -void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 58ee7dc87669..2484c7ec6a72 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -57,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, static void copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) { - if (ext4_ext_is_uninitialized(src)) - ext4_ext_mark_uninitialized(dest); + if (ext4_ext_is_unwritten(src)) + ext4_ext_mark_unwritten(dest); else dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); } @@ -391,6 +391,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode, if (depth) { /* Register to journal */ + BUFFER_TRACE(orig_path->p_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, orig_path->p_bh); if (ret) return ret; @@ -593,14 +594,14 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, * @inode: inode in question * @from: block offset of inode * @count: block count to be checked - * @uninit: extents expected to be uninitialized + * @unwritten: extents expected to be unwritten * @err: pointer to save error value * * Return 1 if all extents in range has expected type, and zero otherwise. */ static int mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, - int uninit, int *err) + int unwritten, int *err) { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; @@ -611,7 +612,7 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, if (*err) goto out; ext = path[ext_depth(inode)].p_ext; - if (uninit != ext4_ext_is_uninitialized(ext)) + if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); @@ -894,7 +895,7 @@ out: * @orig_page_offset: page index on original file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped - * @uninit: orig extent is uninitialized or not + * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents @@ -905,7 +906,7 @@ out: static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit, int *err) + int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; @@ -962,27 +963,27 @@ again: if (unlikely(*err < 0)) goto stop_journal; /* - * If orig extent was uninitialized it can become initialized + * If orig extent was unwritten it can become initialized * at any time after i_data_sem was dropped, in order to * serialize with delalloc we have recheck extent while we * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - if (uninit) { + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ - uninit = mext_check_coverage(orig_inode, orig_blk_offset, - block_len_in_page, 1, err); + unwritten = mext_check_coverage(orig_inode, orig_blk_offset, + block_len_in_page, 1, err); if (*err) goto drop_data_sem; - uninit &= mext_check_coverage(donor_inode, orig_blk_offset, - block_len_in_page, 1, err); + unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, + block_len_in_page, 1, err); if (*err) goto drop_data_sem; - if (!uninit) { + if (!unwritten) { ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } @@ -1259,7 +1260,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; - int uninit; + int unwritten; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " @@ -1391,8 +1392,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, !last_extent) continue; - /* Is original extent is uninitialized */ - uninit = ext4_ext_is_uninitialized(ext_prev); + /* Is original extent is unwritten */ + unwritten = ext4_ext_is_unwritten(ext_prev); data_offset_in_page = seq_start % blocks_per_page; @@ -1432,8 +1433,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, o_filp, donor_inode, orig_page_offset, data_offset_in_page, - block_len_in_page, uninit, - &ret); + block_len_in_page, + unwritten, &ret); /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1cb84f78909e..3520ab8a6639 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -67,6 +67,7 @@ static struct buffer_head *ext4_append(handle_t *handle, return ERR_PTR(err); inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); @@ -1778,6 +1779,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -2510,8 +2512,7 @@ static int empty_dir(struct inode *inode) ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); de = ext4_next_entry(de1, sb->s_blocksize); while (offset < inode->i_size) { - if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { unsigned int lblock; err = 0; brelse(bh); @@ -2539,26 +2540,37 @@ static int empty_dir(struct inode *inode) return 1; } -/* ext4_orphan_add() links an unlinked or truncated inode into a list of +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; + bool dirty = false; - if (!EXT4_SB(sb)->s_journal) + if (!sbi->s_journal) return 0; - mutex_lock(&EXT4_SB(sb)->s_orphan_lock); + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* + * Exit early if inode already is on orphan list. This is a big speedup + * since we don't have to contend on the global s_orphan_lock. + */ if (!list_empty(&EXT4_I(inode)->i_orphan)) - goto out_unlock; + return 0; /* * Orphan handling is only valid for files with data blocks @@ -2569,48 +2581,51 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) - goto out_unlock; + goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) - goto out_unlock; + goto out; + + mutex_lock(&sbi->s_orphan_lock); /* * Due to previous errors inode may be already a part of on-disk * orphan list. If so skip on-disk list modification. */ - if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) - goto mem_insert; - - /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_super(handle, sb); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - - /* Only add to the head of the in-memory list if all the - * previous operations succeeded. If the orphan_add is going to - * fail (possibly taking the journal offline), we can't risk - * leaving the inode on the orphan list: stray orphan-list - * entries can cause panics at unmount time. - * - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ -mem_insert: - if (!err) - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_super(handle, sb); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: - mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); - ext4_std_error(inode->i_sb, err); +out: + ext4_std_error(sb, err); return err; } @@ -2622,45 +2637,51 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 ino_next; struct ext4_iloc iloc; int err = 0; - if ((!EXT4_SB(inode->i_sb)->s_journal) && - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) - goto out; + return 0; - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; - sbi = EXT4_SB(inode->i_sb); + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + mutex_lock(&sbi->s_orphan_lock); jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (!handle) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_err; + } + ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_super(handle, inode->i_sb); } else { struct ext4_iloc iloc2; @@ -2670,20 +2691,20 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); - out_err: ext4_std_error(inode->i_sb, err); -out: - mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ab95508e3d40..b24a2541a9ba 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { @@ -400,7 +401,8 @@ submit_and_retry: int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc) + struct writeback_control *wbc, + bool keep_towrite) { struct inode *inode = page->mapping->host; unsigned block_start, blocksize; @@ -413,10 +415,24 @@ int ext4_bio_write_page(struct ext4_io_submit *io, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - set_page_writeback(page); + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); ClearPageError(page); /* + * Comments copied from block_write_full_page: + * + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + /* * In the first loop we prepare and mark buffers to submit. We have to * mark all buffers in the page before submitting so that * end_page_writeback() cannot be called from ext4_bio_end_io() when IO @@ -427,19 +443,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { block_start = bh_offset(bh); if (block_start >= len) { - /* - * Comments copied from block_write_full_page_endio: - * - * The page straddles i_size. It must be zeroed out on - * each and every writepage invocation because it may - * be mmapped. "A file is mapped in multiples of the - * page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when - * mapped, and writes to that region are not written - * out to the file." - */ - zero_user_segment(page, block_start, - block_start + blocksize); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f3b84cd9de56..bb0e80f03e2e 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb) void ext4_resize_end(struct super_block *sb) { clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, @@ -348,6 +348,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, bh = sb_getblk(sb, blk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); @@ -426,6 +427,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, if (unlikely(!bh)) return -ENOMEM; + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) return err; @@ -518,6 +520,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, goto out; } + BUFFER_TRACE(gdb, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb); if (err) { brelse(gdb); @@ -790,14 +793,17 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dind; } + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (unlikely(err)) goto exit_dind; + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) goto exit_dind; + BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) ext4_std_error(sb, err); @@ -902,6 +908,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; ext4_kvfree(o_group_desc); + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) brelse(gdb_bh); @@ -977,6 +984,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } for (i = 0; i < reserved_gdb; i++) { + BUFFER_TRACE(primary[i], "get_write_access"); if ((err = ext4_journal_get_write_access(handle, primary[i]))) goto exit_bh; } @@ -1084,6 +1092,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, ext4_debug("update metadata backup %llu(+%llu)\n", backup_block, backup_block - ext4_group_first_block_no(sb, group)); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) break; lock_buffer(bh); @@ -1163,6 +1172,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, */ if (gdb_off) { gdb_bh = sbi->s_group_desc[gdb_num]; + BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) @@ -1433,6 +1443,7 @@ static int ext4_flex_group_add(struct super_block *sb, goto exit; } + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto exit_journal; @@ -1645,6 +1656,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, return err; } + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) { ext4_warning(sb, "error %d on journal write access", err); @@ -1804,6 +1816,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) if (IS_ERR(handle)) return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto errout; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f3c667091618..6df7bc611dbd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -138,8 +138,8 @@ static __le32 ext4_superblock_csum(struct super_block *sb, return cpu_to_le32(csum); } -int ext4_superblock_csum_verify(struct super_block *sb, - struct ext4_super_block *es) +static int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) { if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -879,6 +879,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); @@ -1524,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, arg = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * arg; } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; sbi->s_max_batch_time = arg; } else if (token == Opt_min_batch_time) { sbi->s_min_batch_time = arg; @@ -1903,7 +1902,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); - else if ((sbi->s_mount_state & EXT4_ERROR_FS)) + else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); @@ -2404,6 +2403,16 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, if (ext4_bg_has_super(sb, bg)) has_super = 1; + /* + * If we have a meta_bg fs with 1k blocks, group 0's GDT is at + * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled + * on modern mke2fs or blksize > 1k on older mke2fs) then we must + * compensate. + */ + if (sb->s_blocksize == 1024 && nr == 0 && + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) + has_super++; + return (has_super + ext4_group_first_block_no(sb, bg)); } @@ -2798,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg) es = sbi->s_es; if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", + /* fsck newer than v1.41.13 is needed to clean this condition. */ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, @@ -2815,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg) printk("\n"); } if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", sb->s_id, le32_to_cpu(es->s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, @@ -3337,7 +3347,7 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting - * uninitialized extents in delalloc path. In most cases such + * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ @@ -5367,6 +5377,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; + BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1f5cf5880718..e7387337060c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -369,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; + if (strlen(name) > 255) + return -ERANGE; + down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); @@ -513,6 +516,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) return; + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); ext4_handle_dirty_super(handle, sb); @@ -520,8 +524,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -532,6 +536,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); + BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; @@ -542,16 +547,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); @@ -759,6 +779,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (s->base) { ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, bs->bh->b_blocknr); + BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) goto cleanup; @@ -844,6 +865,7 @@ inserted: EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; + BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access(handle, new_bh); if (error) @@ -881,7 +903,7 @@ inserted: * take i_data_sem because we will test * i_delalloc_reserved_flag in ext4_mb_new_blocks */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); up_read((&EXT4_I(inode)->i_data_sem)); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index e93e4ec7d165..dbe2141d10ad 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, } } - error = f2fs_setxattr(inode, name_index, "", value, size, ipage); + error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); kfree(value); if (!error) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4aa521aa9bc3..0b4710c1d370 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -33,12 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + page = grab_cache_page(mapping, index); if (!page) { cond_resched(); goto repeat; } - + f2fs_wait_on_page_writeback(page, META); SetPageUptodate(page); return page; } @@ -69,11 +69,10 @@ repeat: goto repeat; } out: - mark_page_accessed(page); return page; } -inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) { switch (type) { case META_NAT: @@ -137,13 +136,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) if (!page) continue; if (PageUptodate(page)) { - mark_page_accessed(page); f2fs_put_page(page, 1); continue; } f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); - mark_page_accessed(page); f2fs_put_page(page, 0); } out: @@ -157,6 +154,8 @@ static int f2fs_write_meta_page(struct page *page, struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + trace_f2fs_writepage(page, META); + if (unlikely(sbi->por_doing)) goto redirty_out; if (wbc->for_reclaim) @@ -174,10 +173,7 @@ no_write: return 0; redirty_out: - dec_page_count(sbi, F2FS_DIRTY_META); - wbc->pages_skipped++; - account_page_redirty(page); - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } @@ -187,6 +183,8 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); long diff, written; + trace_f2fs_writepages(mapping->host, wbc, META); + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) @@ -370,7 +368,9 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) return; sbi->por_doing = true; - start_blk = __start_cp_addr(sbi) + 1; + + start_blk = __start_cp_addr(sbi) + 1 + + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); orphan_blkaddr = __start_sum_addr(sbi) - 1; ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); @@ -511,8 +511,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + block_t cp_blk_no; + int i; - sbi->ckpt = kzalloc(blk_size, GFP_KERNEL); + sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* @@ -543,6 +546,23 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) + cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = (unsigned char *)sbi->ckpt; + + cur_page = get_meta_page(sbi, cp_blk_no + i); + sit_bitmap_ptr = page_address(cur_page); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_put_page(cur_page, 1); + } +done: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 1); return 0; @@ -555,14 +575,13 @@ fail_no_cp: static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct dir_inode_entry *entry; - list_for_each_entry(entry, head, list) - if (unlikely(entry->inode == inode)) - return -EEXIST; + if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) + return -EEXIST; - list_add_tail(&new->list, head); + set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + F2FS_I(inode)->dirty_dir = new; + list_add_tail(&new->list, &sbi->dir_inode_list); stat_inc_dirty_dir(sbi); return 0; } @@ -611,31 +630,26 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head; struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&sbi->dir_inode_lock); - if (get_dirty_dents(inode)) { + if (get_dirty_dents(inode) || + !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { spin_unlock(&sbi->dir_inode_lock); return; } - head = &sbi->dir_inode_list; - list_for_each_entry(entry, head, list) { - if (entry->inode == inode) { - list_del(&entry->list); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); - goto done; - } - } + entry = F2FS_I(inode)->dirty_dir; + list_del(&entry->list); + F2FS_I(inode)->dirty_dir = NULL; + clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + stat_dec_dirty_dir(sbi); spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); -done: /* Only from the recovery routine */ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); @@ -643,26 +657,6 @@ done: } } -struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) -{ - - struct list_head *head; - struct inode *inode = NULL; - struct dir_inode_entry *entry; - - spin_lock(&sbi->dir_inode_lock); - - head = &sbi->dir_inode_list; - list_for_each_entry(entry, head, list) { - if (entry->inode->i_ino == ino) { - inode = entry->inode; - break; - } - } - spin_unlock(&sbi->dir_inode_lock); - return inode; -} - void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) { struct list_head *head; @@ -761,6 +755,13 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) __u32 crc32 = 0; void *kaddr; int i; + int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + + /* + * This avoids to conduct wrong roll-forward operations and uses + * metapages, so should be called prior to sync_meta_pages below. + */ + discard_next_dnode(sbi); /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) @@ -805,16 +806,19 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) / F2FS_ORPHANS_PER_BLOCK; - ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); if (is_umount) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); } else { clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks); + cp_payload_blks + data_sum_blocks + + orphan_blocks); } if (sbi->n_orphans) @@ -840,6 +844,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); + for (i = 1; i < 1 + cp_payload_blks; i++) { + cp_page = grab_meta_page(sbi, start_blk++); + kaddr = page_address(cp_page); + memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, + (1 << sbi->log_blocksize)); + set_page_dirty(cp_page); + f2fs_put_page(cp_page, 1); + } + if (sbi->n_orphans) { write_orphan_inodes(sbi, start_blk); start_blk += orphan_blocks; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 45abd60e2bff..f8cf619edb5f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -417,7 +417,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); - page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -455,7 +455,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) int err; repeat: - page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); + page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -608,8 +608,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; @@ -637,7 +637,7 @@ static int get_data_block(struct inode *inode, sector_t iblock, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; if (dn.data_blkaddr != NULL_ADDR) { @@ -652,8 +652,7 @@ static int get_data_block(struct inode *inode, sector_t iblock, goto put_out; } - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); bh_result->b_size = (((size_t)1) << blkbits); dn.ofs_in_node++; pgofs++; @@ -672,11 +671,10 @@ get_next: err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) + if (dn.data_blkaddr == NEW_ADDR && !fiemap) goto put_out; - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } if (maxblocks > (bh_result->b_size >> blkbits)) { @@ -710,11 +708,32 @@ out: return err; } +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); +} + static int f2fs_read_data_page(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; int ret; + trace_f2fs_readpage(page, DATA); + /* If the file has inline data, try to read it directlly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); @@ -790,6 +809,8 @@ static int f2fs_write_data_page(struct page *page, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, }; + trace_f2fs_writepage(page, DATA); + if (page->index < end_index) goto write; @@ -798,10 +819,8 @@ static int f2fs_write_data_page(struct page *page, * this page does not have to be written to disk. */ offset = i_size & (PAGE_CACHE_SIZE - 1); - if ((page->index >= end_index + 1) || !offset) { - inode_dec_dirty_dents(inode); + if ((page->index >= end_index + 1) || !offset) goto out; - } zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: @@ -810,7 +829,6 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - inode_dec_dirty_dents(inode); err = do_write_data_page(page, &fio); goto done; } @@ -832,15 +850,16 @@ done: clear_cold_data(page); out: + inode_dec_dirty_dents(inode); unlock_page(page); if (need_balance_fs) f2fs_balance_fs(sbi); + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, DATA, WRITE); return 0; redirty_out: - wbc->pages_skipped++; - account_page_redirty(page); - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } @@ -862,12 +881,15 @@ static int f2fs_write_data_pages(struct address_space *mapping, int ret; long diff; + trace_f2fs_writepages(mapping->host, wbc, DATA); + /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && - get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && + available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; diff = nr_pages_to_write(sbi, DATA, wbc); @@ -903,6 +925,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; + trace_f2fs_write_begin(inode, pos, len, flags); + f2fs_balance_fs(sbi); repeat: err = f2fs_convert_inline_data(inode, pos + len); @@ -912,6 +936,10 @@ repeat: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; + + /* to avoid latency during memory pressure */ + unlock_page(page); + *pagep = page; if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) @@ -923,10 +951,18 @@ repeat: f2fs_unlock_op(sbi); if (err) { - f2fs_put_page(page, 1); + f2fs_put_page(page, 0); return err; } inline_data: + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + + f2fs_wait_on_page_writeback(page, DATA); + if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -978,6 +1014,8 @@ static int f2fs_write_end(struct file *file, { struct inode *inode = page->mapping->host; + trace_f2fs_write_end(inode, pos, len, copied); + SetPageUptodate(page); set_page_dirty(page); @@ -992,10 +1030,9 @@ static int f2fs_write_end(struct file *file, } static int check_direct_IO(struct inode *inode, int rw, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - int i; if (rw == READ) return 0; @@ -1003,14 +1040,14 @@ static int check_direct_IO(struct inode *inode, int rw, if (offset & blocksize_mask) return -EINVAL; - for (i = 0; i < nr_segs; i++) - if (iov[i].iov_len & blocksize_mask) - return -EINVAL; + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + return 0; } static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1019,11 +1056,14 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, if (f2fs_has_inline_data(inode)) return 0; - if (check_direct_IO(inode, rw, iov, offset, nr_segs)) + if (check_direct_IO(inode, rw, iter, offset)) return 0; - return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block); + /* clear fsync mark to recover these blocks */ + fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); + + return blockdev_direct_IO(rw, iocb, inode, iter, offset, + get_data_block); } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, @@ -1061,6 +1101,11 @@ static int f2fs_set_data_page_dirty(struct page *page) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { + struct inode *inode = mapping->host; + + if (f2fs_has_inline_data(inode)) + return 0; + return generic_block_bmap(mapping, block, get_data_block); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 972fd0ef230f..a4addd72ebbd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -23,10 +23,10 @@ static unsigned long dir_blocks(struct inode *inode) static unsigned int dir_buckets(unsigned int level, int dir_level) { - if (level < MAX_DIR_HASH_DEPTH / 2) + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1); + return MAX_DIR_BUCKETS; } static unsigned int bucket_blocks(unsigned int level) @@ -268,6 +268,8 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; + f2fs_wait_on_page_writeback(ipage, NODE); + /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); ri->i_namelen = cpu_to_le32(name->len); @@ -374,11 +376,11 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); +error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0); remove_dirty_dir_inode(inode); -error: remove_inode_page(inode); return ERR_PTR(err); } @@ -637,11 +639,17 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dir_entry *de = NULL; struct page *dentry_page = NULL; + struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); unsigned char d_type = DT_UNKNOWN; bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2ecac8312359..58df97e174d0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -182,6 +182,8 @@ enum { #define F2FS_LINK_MAX 32000 /* maximum link count per file */ +#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ @@ -218,6 +220,7 @@ struct f2fs_inode_info { nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ + struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ }; static inline void get_extent_info(struct extent_info *ext, @@ -243,6 +246,7 @@ static inline void set_raw_extent(struct extent_info *ext, struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ + nid_t available_nids; /* maximum available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ @@ -323,15 +327,21 @@ struct flush_cmd { int ret; }; +struct flush_cmd_control { + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct flush_cmd *issue_list; /* list for command issue */ + struct flush_cmd *dispatch_list; /* list for command dispatch */ + spinlock_t issue_lock; /* for issue list lock */ + struct flush_cmd *issue_tail; /* list tail of issue list */ +}; + struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ - block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -353,12 +363,8 @@ struct f2fs_sm_info { unsigned int min_ipu_util; /* in-place-update threshold */ /* for flush command control */ - struct task_struct *f2fs_issue_flush; /* flush thread */ - wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - struct flush_cmd *issue_list; /* list for command issue */ - struct flush_cmd *dispatch_list; /* list for command dispatch */ - spinlock_t issue_lock; /* for issue list lock */ - struct flush_cmd *issue_tail; /* list tail of issue list */ + struct flush_cmd_control *cmd_control_info; + }; /* @@ -635,7 +641,8 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) */ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; @@ -755,9 +762,18 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - int offset = (flag == NAT_BITMAP) ? + int offset; + + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) { + if (flag == NAT_BITMAP) + return &ckpt->sit_nat_version_bitmap; + else + return ((unsigned char *)ckpt + F2FS_BLKSIZE); + } else { + offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return &ckpt->sit_nat_version_bitmap + offset; + } } static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) @@ -958,6 +974,7 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ @@ -1071,6 +1088,12 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +/* get offset of first page in next direct node */ +#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ + ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ + (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) + /* * file.c */ @@ -1140,8 +1163,10 @@ f2fs_hash_t f2fs_dentry_hash(const char *, size_t); struct dnode_of_data; struct node_info; +bool available_free_memory(struct f2fs_sb_info *, int); int is_checkpointed_node(struct f2fs_sb_info *, nid_t); bool fsync_mark_done(struct f2fs_sb_info *, nid_t); +void fsync_mark_clear(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); @@ -1176,9 +1201,12 @@ void destroy_node_manager_caches(void); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); +int create_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); @@ -1221,7 +1249,6 @@ int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); -struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); void init_orphan_info(struct f2fs_sb_info *); @@ -1242,6 +1269,7 @@ struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct page *, struct f2fs_io_info *); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); /* * gc.c @@ -1391,5 +1419,6 @@ bool f2fs_may_inline(struct inode *); int f2fs_read_inline_data(struct inode *, struct page *); int f2fs_convert_inline_data(struct inode *, pgoff_t); int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); +void truncate_inline_data(struct inode *, u64); int recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 60e7d5448a1d..7d8b96275092 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -19,6 +19,7 @@ #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/mount.h> +#include <linux/pagevec.h> #include "f2fs.h" #include "node.h" @@ -194,6 +195,132 @@ out: return ret; } +static pgoff_t __get_first_dirty_index(struct address_space *mapping, + pgoff_t pgofs, int whence) +{ + struct pagevec pvec; + int nr_pages; + + if (whence != SEEK_DATA) + return 0; + + /* find first dirty page index */ + pagevec_init(&pvec, 0); + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; + pagevec_release(&pvec); + return pgofs; +} + +static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, + int whence) +{ + switch (whence) { + case SEEK_DATA: + if ((blkaddr == NEW_ADDR && dirty == pgofs) || + (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) + return true; + break; + case SEEK_HOLE: + if (blkaddr == NULL_ADDR) + return true; + break; + } + return false; +} + +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + struct dnode_of_data dn; + pgoff_t pgofs, end_offset, dirty; + loff_t data_ofs = offset; + loff_t isize; + int err = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) + goto fail; + + /* handle inline data case */ + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) + data_ofs = isize; + goto found; + } + + pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + + dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); + + for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + if (err && err != -ENOENT) { + goto fail; + } else if (err == -ENOENT) { + /* direct node is not exist */ + if (whence == SEEK_DATA) { + pgofs = PGOFS_OF_NEXT_DNODE(pgofs, + F2FS_I(inode)); + continue; + } else { + goto found; + } + } + + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + + /* find data/hole in dnode block */ + for (; dn.ofs_in_node < end_offset; + dn.ofs_in_node++, pgofs++, + data_ofs = pgofs << PAGE_CACHE_SHIFT) { + block_t blkaddr; + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (__found_offset(blkaddr, dirty, pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } + } + f2fs_put_dnode(&dn); + } + + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + mutex_unlock(&inode->i_mutex); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + mutex_unlock(&inode->i_mutex); + return -ENXIO; +} + +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + return f2fs_seek_block(file, offset, whence); + } + + return -EINVAL; +} + static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); @@ -242,6 +369,9 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) unsigned offset = from & (PAGE_CACHE_SIZE - 1); struct page *page; + if (f2fs_has_inline_data(inode)) + return truncate_inline_data(inode, from); + if (!offset) return; @@ -288,10 +418,7 @@ int truncate_blocks(struct inode *inode, u64 from) return err; } - if (IS_INODE(dn.node_page)) - count = ADDRS_PER_INODE(F2FS_I(inode)); - else - count = ADDRS_PER_BLOCK; + count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); count -= dn.ofs_in_node; f2fs_bug_on(count < 0); @@ -413,6 +540,7 @@ const struct inode_operations f2fs_file_inode_operations = { .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, #endif + .fiemap = f2fs_fiemap, }; static void fill_zero(struct inode *inode, pgoff_t index, @@ -531,16 +659,19 @@ static int expand_inode_data(struct inode *inode, loff_t offset, off_start = offset & (PAGE_CACHE_SIZE - 1); off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + f2fs_lock_op(sbi); + for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - f2fs_lock_op(sbi); + if (index == pg_end && !off_end) + goto noalloc; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_reserve_block(&dn, index); - f2fs_unlock_op(sbi); if (ret) break; - +noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) @@ -555,7 +686,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); + update_inode_page(inode); } + f2fs_unlock_op(sbi); return ret; } @@ -678,11 +811,11 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif const struct file_operations f2fs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .llseek = f2fs_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .open = generic_file_open, .mmap = f2fs_file_mmap, .fsync = f2fs_sync_file, @@ -692,5 +825,5 @@ const struct file_operations f2fs_file_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 383db1fabcf4..1bba5228c197 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -81,8 +81,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) f2fs_lock_op(sbi); ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } /* * i_addr[0] is not used for inline data, @@ -90,11 +92,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) */ set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, 0); - if (err) { - f2fs_unlock_op(sbi); - return err; - } + if (err) + goto out; + f2fs_wait_on_page_writeback(page, DATA); zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); /* Copy the whole inline data block */ @@ -118,6 +119,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) sync_inode_page(&dn); f2fs_put_dnode(&dn); +out: f2fs_unlock_op(sbi); return err; } @@ -132,7 +134,7 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) else if (to_size <= MAX_INLINE_DATA) return 0; - page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS); + page = grab_cache_page(inode->i_mapping, 0); if (!page) return -ENOMEM; @@ -155,6 +157,7 @@ int f2fs_write_inline_data(struct inode *inode, return err; ipage = dn.inode_page; + f2fs_wait_on_page_writeback(ipage, NODE); zero_user_segment(ipage, INLINE_DATA_OFFSET, INLINE_DATA_OFFSET + MAX_INLINE_DATA); src_addr = kmap(page); @@ -175,6 +178,26 @@ int f2fs_write_inline_data(struct inode *inode, return 0; } +void truncate_inline_data(struct inode *inode, u64 from) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + + if (from >= MAX_INLINE_DATA) + return; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return; + + f2fs_wait_on_page_writeback(ipage, NODE); + + zero_user_segment(ipage, INLINE_DATA_OFFSET + from, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); +} + int recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -199,6 +222,8 @@ process_inline: ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(IS_ERR(ipage)); + f2fs_wait_on_page_writeback(ipage, NODE); + src_addr = inline_data_addr(npage); dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); @@ -210,6 +235,7 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(IS_ERR(ipage)); + f2fs_wait_on_page_writeback(ipage, NODE); zero_user_segment(ipage, INLINE_DATA_OFFSET, INLINE_DATA_OFFSET + MAX_INLINE_DATA); clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ee829d360468..2cf6962f6cc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/bitops.h> #include "f2fs.h" #include "node.h" @@ -21,20 +22,20 @@ void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; - - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | - S_NOATIME | S_DIRSYNC); + unsigned int new_fl = 0; if (flags & FS_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + set_mask_bits(&inode->i_flags, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -77,6 +78,7 @@ static int do_read_inode(struct inode *inode) if (check_nid_range(sbi, inode->i_ino)) { f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", (unsigned long) inode->i_ino); + WARN_ON(1); return -EINVAL; } @@ -294,4 +296,5 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a9409d19dfd4..a6bdddc33ce2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -41,18 +41,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); - inode->i_uid = current_fsuid(); - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + inode_init_owner(inode, dir, mode); inode->i_ino = ino; - inode->i_mode = mode; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_generation = sbi->s_next_generation++; @@ -426,9 +417,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); @@ -457,6 +445,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); @@ -466,9 +458,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - down_write(&F2FS_I(old_inode)->i_sem); - F2FS_I(old_inode)->i_pino = new_dir->i_ino; - up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); @@ -483,7 +472,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: - f2fs_put_page(new_page, 1); + kunmap(new_page); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { kunmap(old_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a161e955c4c8..4b697ccc9b0c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -26,20 +26,28 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; -static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type) +bool available_free_memory(struct f2fs_sb_info *sbi, int type) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct sysinfo val; unsigned long mem_size = 0; + bool res = false; si_meminfo(&val); - if (type == FREE_NIDS) - mem_size = nm_i->fcnt * sizeof(struct free_nid); - else if (type == NAT_ENTRIES) - mem_size += nm_i->nat_cnt * sizeof(struct nat_entry); - mem_size >>= 12; - - /* give 50:50 memory for free nids and nat caches respectively */ - return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11)); + /* give 25%, 25%, 50% memory for each components respectively */ + if (type == FREE_NIDS) { + mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12; + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { + mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); + } + return res; } static void clear_node_page_dirty(struct page *page) @@ -147,6 +155,18 @@ bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) return fsync_done; } +void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + + write_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + e->fsync_done = false; + write_unlock(&nm_i->nat_tree_lock); +} + static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; @@ -179,9 +199,7 @@ retry: write_unlock(&nm_i->nat_tree_lock); goto retry; } - nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); - nat_set_ino(e, le32_to_cpu(ne->ino)); - nat_set_version(e, ne->version); + node_info_from_raw_nat(&e->ni, ne); } write_unlock(&nm_i->nat_tree_lock); } @@ -243,7 +261,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (available_free_memory(nm_i, NAT_ENTRIES)) + if (available_free_memory(sbi, NAT_ENTRIES)) return 0; write_lock(&nm_i->nat_tree_lock); @@ -849,8 +867,7 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page_write_begin(NODE_MAPPING(sbi), - dn->nid, AOP_FLAG_NOFS); + page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); if (!page) return ERR_PTR(-ENOMEM); @@ -867,6 +884,7 @@ struct page *new_node_page(struct dnode_of_data *dn, new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); + f2fs_wait_on_page_writeback(page, NODE); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); @@ -946,8 +964,7 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) struct page *page; int err; repeat: - page = grab_cache_page_write_begin(NODE_MAPPING(sbi), - nid, AOP_FLAG_NOFS); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -967,7 +984,6 @@ repeat: goto repeat; } got_it: - mark_page_accessed(page); return page; } @@ -1022,7 +1038,6 @@ page_hit: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - mark_page_accessed(page); return page; } @@ -1196,6 +1211,8 @@ static int f2fs_write_node_page(struct page *page, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, }; + trace_f2fs_writepage(page, NODE); + if (unlikely(sbi->por_doing)) goto redirty_out; @@ -1227,10 +1244,7 @@ static int f2fs_write_node_page(struct page *page, return 0; redirty_out: - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - account_page_redirty(page); - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } @@ -1240,6 +1254,8 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); long diff; + trace_f2fs_writepages(mapping->host, wbc, NODE); + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1315,13 +1331,14 @@ static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) +static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; bool allocated = false; - if (!available_free_memory(nm_i, FREE_NIDS)) + if (!available_free_memory(sbi, FREE_NIDS)) return -1; /* 0 nid should not be used */ @@ -1374,9 +1391,10 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void scan_nat_page(struct f2fs_nm_info *nm_i, +static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; int i; @@ -1391,7 +1409,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) { - if (add_free_nid(nm_i, start_nid, true) < 0) + if (add_free_nid(sbi, start_nid, true) < 0) break; } } @@ -1415,7 +1433,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) while (1) { struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(nm_i, page, nid); + scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); @@ -1435,7 +1453,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); nid = le32_to_cpu(nid_in_journal(sum, i)); if (addr == NULL_ADDR) - add_free_nid(nm_i, nid, true); + add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } @@ -1452,7 +1470,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: - if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) + if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; spin_lock(&nm_i->free_nid_list_lock); @@ -1512,7 +1530,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - if (!available_free_memory(nm_i, FREE_NIDS)) { + if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; } else { @@ -1534,7 +1552,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } -void recover_inline_xattr(struct inode *inode, struct page *page) +static void recover_inline_xattr(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); void *src_addr, *dst_addr; @@ -1559,6 +1577,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(page); inline_size = inline_xattr_size(inode); + f2fs_wait_on_page_writeback(ipage, NODE); memcpy(dst_addr, src_addr, inline_size); update_inode(inode, ipage); @@ -1614,6 +1633,11 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct node_info old_ni, new_ni; struct page *ipage; + get_node_info(sbi, ino, &old_ni); + + if (unlikely(old_ni.blk_addr != NULL_ADDR)) + return -EINVAL; + ipage = grab_cache_page(NODE_MAPPING(sbi), ino); if (!ipage) return -ENOMEM; @@ -1621,7 +1645,6 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - get_node_info(sbi, ino, &old_ni); SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); @@ -1647,35 +1670,29 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) /* * ra_sum_pages() merge contiguous pages into one bio and submit. - * these pre-readed pages are linked in pages list. + * these pre-readed pages are alloced in bd_inode's mapping tree. */ -static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, +static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, int start, int nrpages) { - struct page *page; - int page_idx = start; + struct inode *inode = sbi->sb->s_bdev->bd_inode; + struct address_space *mapping = inode->i_mapping; + int i, page_idx = start; struct f2fs_io_info fio = { .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO }; - for (; page_idx < start + nrpages; page_idx++) { - /* alloc temporal page for read node summary info*/ - page = alloc_page(GFP_F2FS_ZERO); - if (!page) + for (i = 0; page_idx < start + nrpages; page_idx++, i++) { + /* alloc page in bd_inode for reading node summary info */ + pages[i] = grab_cache_page(mapping, page_idx); + if (!pages[i]) break; - - lock_page(page); - page->index = page_idx; - list_add_tail(&page->lru, pages); + f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio); } - list_for_each_entry(page, pages, lru) - f2fs_submit_page_mbio(sbi, page, page->index, &fio); - f2fs_submit_merged_bio(sbi, META, READ); - - return page_idx - start; + return i; } int restore_node_summary(struct f2fs_sb_info *sbi, @@ -1683,11 +1700,11 @@ int restore_node_summary(struct f2fs_sb_info *sbi, { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct page *page, *tmp; + struct inode *inode = sbi->sb->s_bdev->bd_inode; block_t addr; int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - int i, last_offset, nrpages, err = 0; - LIST_HEAD(page_list); + struct page *pages[bio_blocks]; + int i, idx, last_offset, nrpages, err = 0; /* scan the node segment */ last_offset = sbi->blocks_per_seg; @@ -1698,29 +1715,31 @@ int restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, bio_blocks); /* read ahead node pages */ - nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages); + nrpages = ra_sum_pages(sbi, pages, addr, nrpages); if (!nrpages) return -ENOMEM; - list_for_each_entry_safe(page, tmp, &page_list, lru) { + for (idx = 0; idx < nrpages; idx++) { if (err) goto skip; - lock_page(page); - if (unlikely(!PageUptodate(page))) { + lock_page(pages[idx]); + if (unlikely(!PageUptodate(pages[idx]))) { err = -EIO; } else { - rn = F2FS_NODE(page); + rn = F2FS_NODE(pages[idx]); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; sum_entry++; } - unlock_page(page); + unlock_page(pages[idx]); skip: - list_del(&page->lru); - __free_pages(page, 0); + page_cache_release(pages[idx]); } + + invalidate_mapping_pages(inode->i_mapping, addr, + addr + nrpages); } return err; } @@ -1758,9 +1777,7 @@ retry: write_unlock(&nm_i->nat_tree_lock); goto retry; } - nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr)); - nat_set_ino(ne, le32_to_cpu(raw_ne.ino)); - nat_set_version(ne, raw_ne.version); + node_info_from_raw_nat(&ne->ni, &raw_ne); __set_nat_cache_dirty(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } @@ -1793,7 +1810,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; - block_t new_blkaddr; if (nat_get_blkaddr(ne) == NEW_ADDR) continue; @@ -1829,11 +1845,7 @@ to_nat_page: f2fs_bug_on(!nat_blk); raw_ne = nat_blk->entries[nid - start_nid]; flush_now: - new_blkaddr = nat_get_blkaddr(ne); - - raw_ne.ino = cpu_to_le32(nat_get_ino(ne)); - raw_ne.block_addr = cpu_to_le32(new_blkaddr); - raw_ne.version = nat_get_version(ne); + raw_nat_from_node_info(&raw_ne, &ne->ni); if (offset < 0) { nat_blk->entries[nid - start_nid] = raw_ne; @@ -1843,7 +1855,7 @@ flush_now: } if (nat_get_blkaddr(ne) == NULL_ADDR && - add_free_nid(NM_I(sbi), nid, false) <= 0) { + add_free_nid(sbi, nid, false) <= 0) { write_lock(&nm_i->nat_tree_lock); __del_from_nat_cache(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); @@ -1871,8 +1883,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; + nm_i->available_nids = nm_i->max_nid - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 5decc1a375f0..7281112cd1c8 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -59,12 +59,12 @@ struct nat_entry { do { \ ne->checkpointed = false; \ list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ - } while (0); + } while (0) #define __clear_nat_cache_dirty(nm_i, ne) \ do { \ ne->checkpointed = true; \ list_move_tail(&ne->list, &nm_i->nat_entries); \ - } while (0); + } while (0) #define inc_node_version(version) (++version) static inline void node_info_from_raw_nat(struct node_info *ni, @@ -75,9 +75,18 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } -enum nid_type { +static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + struct node_info *ni) +{ + raw_ne->ino = cpu_to_le32(ni->ino); + raw_ne->block_addr = cpu_to_le32(ni->blk_addr); + raw_ne->version = ni->version; +} + +enum mem_type { FREE_NIDS, /* indicates the free nid list */ - NAT_ENTRIES /* indicates the cached nat entry */ + NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS /* indicates dirty dentry pages */ }; /* @@ -263,7 +272,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - wait_on_page_writeback(p); + f2fs_wait_on_page_writeback(p, NODE); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b1ae89f0f44e..a112368a4a86 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -46,15 +46,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode) struct inode *dir, *einode; int err = 0; - dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino); - if (!dir) { - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); - add_dirty_dir_inode(dir); + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; } name.len = le32_to_cpu(raw_inode->i_namelen); @@ -63,7 +58,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) if (unlikely(name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out; + goto out_err; } retry: de = f2fs_find_entry(dir, &name, &page); @@ -73,7 +68,8 @@ retry: einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); - if (PTR_ERR(einode) == -ENOENT) + err = PTR_ERR(einode); + if (err == -ENOENT) err = -EEXIST; goto out_unmap_put; } @@ -87,11 +83,23 @@ retry: goto retry; } err = __f2fs_add_link(dir, &name, inode); + if (err) + goto out_err; + + if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { + iput(dir); + } else { + add_dirty_dir_inode(dir); + set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + } + goto out; out_unmap_put: kunmap(page); f2fs_put_page(page, 0); +out_err: + iput(dir); out: f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", @@ -299,10 +307,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; start = start_bidx_of_node(ofs_of_node(page), fi); - if (IS_INODE(page)) - end = start + ADDRS_PER_INODE(fi); - else - end = start + ADDRS_PER_BLOCK; + end = start + ADDRS_PER_PAGE(page, fi); f2fs_lock_op(sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 085f548be7a3..d04613df710a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -25,7 +25,6 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *flush_cmd_slab; /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since @@ -200,20 +199,20 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct f2fs_sm_info *sm_i = SM_I(sbi); - wait_queue_head_t *q = &sm_i->flush_wait_queue; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) return 0; - spin_lock(&sm_i->issue_lock); - if (sm_i->issue_list) { - sm_i->dispatch_list = sm_i->issue_list; - sm_i->issue_list = sm_i->issue_tail = NULL; + spin_lock(&fcc->issue_lock); + if (fcc->issue_list) { + fcc->dispatch_list = fcc->issue_list; + fcc->issue_list = fcc->issue_tail = NULL; } - spin_unlock(&sm_i->issue_lock); + spin_unlock(&fcc->issue_lock); - if (sm_i->dispatch_list) { + if (fcc->dispatch_list) { struct bio *bio = bio_alloc(GFP_NOIO, 0); struct flush_cmd *cmd, *next; int ret; @@ -221,47 +220,80 @@ repeat: bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); - for (cmd = sm_i->dispatch_list; cmd; cmd = next) { + for (cmd = fcc->dispatch_list; cmd; cmd = next) { cmd->ret = ret; next = cmd->next; complete(&cmd->wait); } - sm_i->dispatch_list = NULL; + bio_put(bio); + fcc->dispatch_list = NULL; } - wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list); + wait_event_interruptible(*q, + kthread_should_stop() || fcc->issue_list); goto repeat; } int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - struct flush_cmd *cmd; - int ret; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd cmd; if (!test_opt(sbi, FLUSH_MERGE)) return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); - cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC); - cmd->next = NULL; - cmd->ret = 0; - init_completion(&cmd->wait); + init_completion(&cmd.wait); + cmd.next = NULL; - spin_lock(&sm_i->issue_lock); - if (sm_i->issue_list) - sm_i->issue_tail->next = cmd; + spin_lock(&fcc->issue_lock); + if (fcc->issue_list) + fcc->issue_tail->next = &cmd; else - sm_i->issue_list = cmd; - sm_i->issue_tail = cmd; - spin_unlock(&sm_i->issue_lock); + fcc->issue_list = &cmd; + fcc->issue_tail = &cmd; + spin_unlock(&fcc->issue_lock); - if (!sm_i->dispatch_list) - wake_up(&sm_i->flush_wait_queue); + if (!fcc->dispatch_list) + wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd->wait); - ret = cmd->ret; - kmem_cache_free(flush_cmd_slab, cmd); - return ret; + wait_for_completion(&cmd.wait); + + return cmd.ret; +} + +int create_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct flush_cmd_control *fcc; + int err = 0; + + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); + if (!fcc) + return -ENOMEM; + spin_lock_init(&fcc->issue_lock); + init_waitqueue_head(&fcc->flush_wait_queue); + sbi->sm_info->cmd_control_info = fcc; + fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(fcc->f2fs_issue_flush)) { + err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; + return err; + } + + return err; +} + +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + struct flush_cmd_control *fcc = + sbi->sm_info->cmd_control_info; + + if (fcc && fcc->f2fs_issue_flush) + kthread_stop(fcc->f2fs_issue_flush); + kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -336,13 +368,26 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static void f2fs_issue_discard(struct f2fs_sb_info *sbi, +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); - blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); +} + +void discard_next_dnode(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + if (f2fs_issue_discard(sbi, blkaddr, 1)) { + struct page *page = grab_meta_page(sbi, blkaddr); + /* zero-filled page */ + set_page_dirty(page); + f2fs_put_page(page, 1); + } } static void add_discard_addrs(struct f2fs_sb_info *sbi, @@ -1832,7 +1877,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - dev_t dev = sbi->sb->s_bdev->bd_dev; struct f2fs_sm_info *sm_info; int err; @@ -1842,8 +1886,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) /* init sm info */ sbi->sm_info = sm_info; - INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); @@ -1860,14 +1902,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; - if (test_opt(sbi, FLUSH_MERGE)) { - spin_lock_init(&sm_info->issue_lock); - init_waitqueue_head(&sm_info->flush_wait_queue); - - sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, - "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(sm_info->f2fs_issue_flush)) - return PTR_ERR(sm_info->f2fs_issue_flush); + if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + err = create_flush_cmd_control(sbi); + if (err) + return err; } err = build_sit_info(sbi); @@ -1976,10 +2014,10 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) void destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); + if (!sm_info) return; - if (sm_info->f2fs_issue_flush) - kthread_stop(sm_info->f2fs_issue_flush); + destroy_flush_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1994,17 +2032,10 @@ int __init create_segment_manager_caches(void) sizeof(struct discard_entry)); if (!discard_entry_slab) return -ENOMEM; - flush_cmd_slab = f2fs_kmem_cache_create("flush_command", - sizeof(struct flush_cmd)); - if (!flush_cmd_slab) { - kmem_cache_destroy(discard_entry_slab); - return -ENOMEM; - } return 0; } void destroy_segment_manager_caches(void) { kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(flush_cmd_slab); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c756923a7302..8f96d9372ade 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -514,7 +514,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC)) + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) seq_printf(seq, ",background_gc=%s", "on"); else seq_printf(seq, ",background_gc=%s", "off"); @@ -542,7 +542,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); - if (test_opt(sbi, FLUSH_MERGE)) + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); @@ -594,6 +594,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; int err, active_logs; + bool need_restart_gc = false; + bool need_stop_gc = false; sync_filesystem(sb); @@ -611,7 +613,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) /* * Previous and new state of filesystem is RO, - * so no point in checking GC conditions. + * so skip checking GC and FLUSH_MERGE conditions. */ if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) goto skip; @@ -625,18 +627,40 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (sbi->gc_thread) { stop_gc_thread(sbi); f2fs_sync_fs(sb, 1); + need_restart_gc = true; } } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { err = start_gc_thread(sbi); if (err) goto restore_opts; + need_stop_gc = true; + } + + /* + * We stop issue flush thread if FS is mounted as RO + * or if flush_merge is not passed in mount option. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + destroy_flush_cmd_control(sbi); + } else if (test_opt(sbi, FLUSH_MERGE) && + !sbi->sm_info->cmd_control_info) { + err = create_flush_cmd_control(sbi); + if (err) + goto restore_gc; } skip: /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); return 0; - +restore_gc: + if (need_restart_gc) { + if (start_gc_thread(sbi)) + f2fs_msg(sbi->sb, KERN_WARNING, + "background gc thread is stop"); + } else if (need_stop_gc) { + stop_gc_thread(sbi); + } restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; @@ -665,9 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (unlikely(ino < F2FS_ROOT_INO(sbi))) - return ERR_PTR(-ESTALE); - if (unlikely(ino >= NM_I(sbi)->max_nid)) + if (check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 503c2451131e..8bea941ee309 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -26,7 +26,7 @@ #include "xattr.h" static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) + size_t list_size, const char *name, size_t len, int type) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); int total_len, prefix_len = 0; @@ -53,11 +53,11 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, return -EINVAL; } - total_len = prefix_len + name_len + 1; + total_len = prefix_len + len + 1; if (list && total_len <= list_size) { memcpy(list, prefix, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; + memcpy(list + prefix_len, name, len); + list[prefix_len + len] = '\0'; } return total_len; } @@ -108,11 +108,12 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL); + return f2fs_setxattr(dentry->d_inode, type, name, + value, size, NULL, flags); } static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) + size_t list_size, const char *name, size_t len, int type) { const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; size_t size; @@ -155,9 +156,6 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, } #ifdef CONFIG_F2FS_FS_SECURITY -static int __f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - struct page *ipage); static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *page) { @@ -165,9 +163,9 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, int err = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - err = __f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, xattr->name, xattr->value, - xattr->value_len, (struct page *)page); + xattr->value_len, (struct page *)page, 0); if (err < 0) break; } @@ -241,26 +239,26 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { NULL, }; -static inline const struct xattr_handler *f2fs_xattr_handler(int name_index) +static inline const struct xattr_handler *f2fs_xattr_handler(int index) { const struct xattr_handler *handler = NULL; - if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map)) - handler = f2fs_xattr_handler_map[name_index]; + if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[index]; return handler; } -static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int name_index, - size_t name_len, const char *name) +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { - if (entry->e_name_index != name_index) + if (entry->e_name_index != index) continue; - if (entry->e_name_len != name_len) + if (entry->e_name_len != len) continue; - if (!memcmp(entry->e_name, name, name_len)) + if (!memcmp(entry->e_name, name, len)) break; } return entry; @@ -347,6 +345,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); + f2fs_wait_on_page_writeback(ipage, NODE); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -354,6 +353,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(page); } inline_addr = inline_xattr_addr(page); + f2fs_wait_on_page_writeback(page, NODE); } memcpy(inline_addr, txattr_addr, inline_size); f2fs_put_page(page, 1); @@ -374,6 +374,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(xpage); } f2fs_bug_on(new_nid); + f2fs_wait_on_page_writeback(xpage, NODE); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -396,42 +397,43 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return 0; } -int f2fs_getxattr(struct inode *inode, int name_index, const char *name, +int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { struct f2fs_xattr_entry *entry; void *base_addr; int error = 0; - size_t value_len, name_len; + size_t size, len; if (name == NULL) return -EINVAL; - name_len = strlen(name); - if (name_len > F2FS_NAME_LEN) + + len = strlen(name); + if (len > F2FS_NAME_LEN) return -ERANGE; base_addr = read_all_xattrs(inode, NULL); if (!base_addr) return -ENOMEM; - entry = __find_xattr(base_addr, name_index, name_len, name); + entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { error = -ENODATA; goto cleanup; } - value_len = le16_to_cpu(entry->e_value_size); + size = le16_to_cpu(entry->e_value_size); - if (buffer && value_len > buffer_size) { + if (buffer && size > buffer_size) { error = -ERANGE; goto cleanup; } if (buffer) { char *pval = entry->e_name + entry->e_name_len; - memcpy(buffer, pval, value_len); + memcpy(buffer, pval, size); } - error = value_len; + error = size; cleanup: kzfree(base_addr); @@ -475,15 +477,15 @@ cleanup: return error; } -static int __f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - struct page *ipage) +static int __f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *ipage, int flags) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; - size_t name_len; + size_t len; __u32 new_hsize; int error = -ENOMEM; @@ -491,11 +493,11 @@ static int __f2fs_setxattr(struct inode *inode, int name_index, return -EINVAL; if (value == NULL) - value_len = 0; + size = 0; - name_len = strlen(name); + len = strlen(name); - if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode)) + if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode)) return -ERANGE; base_addr = read_all_xattrs(inode, ipage); @@ -503,16 +505,23 @@ static int __f2fs_setxattr(struct inode *inode, int name_index, goto exit; /* find entry with wanted name. */ - here = __find_xattr(base_addr, name_index, name_len, name); + here = __find_xattr(base_addr, index, len, name); found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - last = here; + if ((flags & XATTR_REPLACE) && !found) { + error = -ENODATA; + goto exit; + } else if ((flags & XATTR_CREATE) && found) { + error = -EEXIST; + goto exit; + } + + last = here; while (!IS_XATTR_LAST_ENTRY(last)) last = XATTR_NEXT_ENTRY(last); - newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + - name_len + value_len); + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); /* 1. Check space */ if (value) { @@ -555,12 +564,12 @@ static int __f2fs_setxattr(struct inode *inode, int name_index, * We just write new entry. */ memset(last, 0, newsize); - last->e_name_index = name_index; - last->e_name_len = name_len; - memcpy(last->e_name, name, name_len); - pval = last->e_name + name_len; - memcpy(pval, value, value_len); - last->e_value_size = cpu_to_le16(value_len); + last->e_name_index = index; + last->e_name_len = len; + memcpy(last->e_name, name, len); + pval = last->e_name + len; + memcpy(pval, value, size); + last->e_value_size = cpu_to_le16(size); new_hsize += newsize; } @@ -583,18 +592,23 @@ exit: return error; } -int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len, struct page *ipage) +int f2fs_setxattr(struct inode *inode, int index, const char *name, + const void *value, size_t size, + struct page *ipage, int flags) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int err; + /* this case is only from init_inode_metadata */ + if (ipage) + return __f2fs_setxattr(inode, index, name, value, + size, ipage, flags); f2fs_balance_fs(sbi); f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); - err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); + err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index b21d9ebdeff3..34ab7dbcf5e3 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -114,18 +114,18 @@ extern const struct xattr_handler f2fs_xattr_security_handler; extern const struct xattr_handler *f2fs_xattr_handlers[]; extern int f2fs_setxattr(struct inode *, int, const char *, - const void *, size_t, struct page *); + const void *, size_t, struct page *, int); extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #else #define f2fs_xattr_handlers NULL -static inline int f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len) +static inline int f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, int flags) { return -EOPNOTSUPP; } -static inline int f2fs_getxattr(struct inode *inode, int name_index, +static inline int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { return -EOPNOTSUPP; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 7c31f4bc74a9..e0c4ba39a377 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -52,7 +52,8 @@ struct fat_mount_options { usefree:1, /* Use free_clusters for FAT32 */ tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ - discard:1; /* Issue discard requests on deletions */ + discard:1, /* Issue discard requests on deletions */ + dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */ }; #define FAT_HASH_BITS 8 diff --git a/fs/fat/file.c b/fs/fat/file.c index 9b104f543056..85f79a89e747 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -170,10 +170,10 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index b3361fe2bcb5..756aead10d96 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -35,9 +35,71 @@ #define CONFIG_FAT_DEFAULT_IOCHARSET "" #endif +#define KB_IN_SECTORS 2 + +/* + * A deserialized copy of the on-disk structure laid out in struct + * fat_boot_sector. + */ +struct fat_bios_param_block { + u16 fat_sector_size; + u8 fat_sec_per_clus; + u16 fat_reserved; + u8 fat_fats; + u16 fat_dir_entries; + u16 fat_sectors; + u16 fat_fat_length; + u32 fat_total_sect; + + u8 fat16_state; + u32 fat16_vol_id; + + u32 fat32_length; + u32 fat32_root_cluster; + u16 fat32_info_sector; + u8 fat32_state; + u32 fat32_vol_id; +}; + static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE; static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET; +static struct fat_floppy_defaults { + unsigned nr_sectors; + unsigned sec_per_clus; + unsigned dir_entries; + unsigned media; + unsigned fat_length; +} floppy_defaults[] = { +{ + .nr_sectors = 160 * KB_IN_SECTORS, + .sec_per_clus = 1, + .dir_entries = 64, + .media = 0xFE, + .fat_length = 1, +}, +{ + .nr_sectors = 180 * KB_IN_SECTORS, + .sec_per_clus = 1, + .dir_entries = 64, + .media = 0xFC, + .fat_length = 2, +}, +{ + .nr_sectors = 320 * KB_IN_SECTORS, + .sec_per_clus = 2, + .dir_entries = 112, + .media = 0xFF, + .fat_length = 1, +}, +{ + .nr_sectors = 360 * KB_IN_SECTORS, + .sec_per_clus = 2, + .dir_entries = 112, + .media = 0xFD, + .fat_length = 2, +}, +}; static int fat_add_cluster(struct inode *inode) { @@ -185,12 +247,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping, } static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; if (rw == WRITE) { @@ -203,7 +266,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * * Return 0, and fallback to normal buffered write. */ - loff_t size = offset + iov_length(iov, nr_segs); + loff_t size = offset + count; if (MSDOS_I(inode)->mmu_private < size) return 0; } @@ -212,10 +275,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - fat_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block); if (ret < 0 && (rw & WRITE)) - fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); + fat_write_failed(mapping, offset + count); return ret; } @@ -359,7 +421,7 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos) static int is_exec(unsigned char *extension) { - unsigned char *exe_extensions = "EXECOMBAT", *walk; + unsigned char exe_extensions[] = "EXECOMBAT", *walk; for (walk = exe_extensions; *walk; walk += 3) if (!strncmp(extension, walk, 3)) @@ -853,6 +915,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); + if (opts->dos1xfloppy) + seq_puts(m, ",dos1xfloppy"); return 0; } @@ -867,7 +931,7 @@ enum { Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, - Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, + Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy, }; static const match_table_t fat_tokens = { @@ -900,6 +964,7 @@ static const match_table_t fat_tokens = { {Opt_nfs_stale_rw, "nfs"}, {Opt_nfs_stale_rw, "nfs=stale_rw"}, {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, + {Opt_dos1xfloppy, "dos1xfloppy"}, {Opt_obsolete, "conv=binary"}, {Opt_obsolete, "conv=text"}, {Opt_obsolete, "conv=auto"}, @@ -1102,6 +1167,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_nfs_nostale_ro: opts->nfs = FAT_NFS_NOSTALE_RO; break; + case Opt_dos1xfloppy: + opts->dos1xfloppy = 1; + break; /* msdos specific */ case Opt_dots: @@ -1247,6 +1315,169 @@ static unsigned long calc_fat_clusters(struct super_block *sb) return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; } +static bool fat_bpb_is_zero(struct fat_boot_sector *b) +{ + if (get_unaligned_le16(&b->sector_size)) + return false; + if (b->sec_per_clus) + return false; + if (b->reserved) + return false; + if (b->fats) + return false; + if (get_unaligned_le16(&b->dir_entries)) + return false; + if (get_unaligned_le16(&b->sectors)) + return false; + if (b->media) + return false; + if (b->fat_length) + return false; + if (b->secs_track) + return false; + if (b->heads) + return false; + return true; +} + +static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, + int silent, struct fat_bios_param_block *bpb) +{ + int error = -EINVAL; + + /* Read in BPB ... */ + memset(bpb, 0, sizeof(*bpb)); + bpb->fat_sector_size = get_unaligned_le16(&b->sector_size); + bpb->fat_sec_per_clus = b->sec_per_clus; + bpb->fat_reserved = le16_to_cpu(b->reserved); + bpb->fat_fats = b->fats; + bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries); + bpb->fat_sectors = get_unaligned_le16(&b->sectors); + bpb->fat_fat_length = le16_to_cpu(b->fat_length); + bpb->fat_total_sect = le32_to_cpu(b->total_sect); + + bpb->fat16_state = b->fat16.state; + bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id); + + bpb->fat32_length = le32_to_cpu(b->fat32.length); + bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster); + bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector); + bpb->fat32_state = b->fat32.state; + bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id); + + /* Validate this looks like a FAT filesystem BPB */ + if (!bpb->fat_reserved) { + if (!silent) + fat_msg(sb, KERN_ERR, + "bogus number of reserved sectors"); + goto out; + } + if (!bpb->fat_fats) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + goto out; + } + + /* + * Earlier we checked here that b->secs_track and b->head are nonzero, + * but it turns out valid FAT filesystems can have zero there. + */ + + if (!fat_valid_media(b->media)) { + if (!silent) + fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", + (unsigned)b->media); + goto out; + } + + if (!is_power_of_2(bpb->fat_sector_size) + || (bpb->fat_sector_size < 512) + || (bpb->fat_sector_size > 4096)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus logical sector size %u", + (unsigned)bpb->fat_sector_size); + goto out; + } + + if (!is_power_of_2(bpb->fat_sec_per_clus)) { + if (!silent) + fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", + (unsigned)bpb->fat_sec_per_clus); + goto out; + } + + error = 0; + +out: + return error; +} + +static int fat_read_static_bpb(struct super_block *sb, + struct fat_boot_sector *b, int silent, + struct fat_bios_param_block *bpb) +{ + static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; + + struct fat_floppy_defaults *fdefaults = NULL; + int error = -EINVAL; + sector_t bd_sects; + unsigned i; + + bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE; + + /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ + if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { + if (!silent) + fat_msg(sb, KERN_ERR, + "%s; no bootstrapping code", notdos1x); + goto out; + } + + /* + * If any value in this region is non-zero, it isn't archaic + * DOS. + */ + if (!fat_bpb_is_zero(b)) { + if (!silent) + fat_msg(sb, KERN_ERR, + "%s; DOS 2.x BPB is non-zero", notdos1x); + goto out; + } + + for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) { + if (floppy_defaults[i].nr_sectors == bd_sects) { + fdefaults = &floppy_defaults[i]; + break; + } + } + + if (fdefaults == NULL) { + if (!silent) + fat_msg(sb, KERN_WARNING, + "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)", + (u64)bd_sects); + goto out; + } + + if (!silent) + fat_msg(sb, KERN_INFO, + "This looks like a DOS 1.x volume; assuming default BPB values"); + + memset(bpb, 0, sizeof(*bpb)); + bpb->fat_sector_size = SECTOR_SIZE; + bpb->fat_sec_per_clus = fdefaults->sec_per_clus; + bpb->fat_reserved = 1; + bpb->fat_fats = 2; + bpb->fat_dir_entries = fdefaults->dir_entries; + bpb->fat_sectors = fdefaults->nr_sectors; + bpb->fat_fat_length = fdefaults->fat_length; + + error = 0; + +out: + return error; +} + /* * Read the super block of an MS-DOS FS. */ @@ -1256,12 +1487,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, struct inode *root_inode = NULL, *fat_inode = NULL; struct inode *fsinfo_inode = NULL; struct buffer_head *bh; - struct fat_boot_sector *b; + struct fat_bios_param_block bpb; struct msdos_sb_info *sbi; u16 logical_sector_size; u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; int debug; - unsigned int media; long error; char buf[50]; @@ -1298,100 +1528,72 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, goto out_fail; } - b = (struct fat_boot_sector *) bh->b_data; - if (!b->reserved) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); - brelse(bh); - goto out_invalid; - } - if (!b->fats) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); - brelse(bh); - goto out_invalid; - } - - /* - * Earlier we checked here that b->secs_track and b->head are nonzero, - * but it turns out valid FAT filesystems can have zero there. - */ + error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent, + &bpb); + if (error == -EINVAL && sbi->options.dos1xfloppy) + error = fat_read_static_bpb(sb, + (struct fat_boot_sector *)bh->b_data, silent, &bpb); + brelse(bh); - media = b->media; - if (!fat_valid_media(media)) { - if (!silent) - fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", - media); - brelse(bh); - goto out_invalid; - } - logical_sector_size = get_unaligned_le16(&b->sector_size); - if (!is_power_of_2(logical_sector_size) - || (logical_sector_size < 512) - || (logical_sector_size > 4096)) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus logical sector size %u", - logical_sector_size); - brelse(bh); - goto out_invalid; - } - sbi->sec_per_clus = b->sec_per_clus; - if (!is_power_of_2(sbi->sec_per_clus)) { - if (!silent) - fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", - sbi->sec_per_clus); - brelse(bh); + if (error == -EINVAL) goto out_invalid; - } + else if (error) + goto out_fail; + + logical_sector_size = bpb.fat_sector_size; + sbi->sec_per_clus = bpb.fat_sec_per_clus; + error = -EIO; if (logical_sector_size < sb->s_blocksize) { fat_msg(sb, KERN_ERR, "logical sector size too small for device" " (logical sector size = %u)", logical_sector_size); - brelse(bh); goto out_fail; } + if (logical_sector_size > sb->s_blocksize) { - brelse(bh); + struct buffer_head *bh_resize; if (!sb_set_blocksize(sb, logical_sector_size)) { fat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sector_size); goto out_fail; } - bh = sb_bread(sb, 0); - if (bh == NULL) { + + /* Verify that the larger boot sector is fully readable */ + bh_resize = sb_bread(sb, 0); + if (bh_resize == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector" " (logical sector size = %lu)", sb->s_blocksize); goto out_fail; } - b = (struct fat_boot_sector *) bh->b_data; + brelse(bh_resize); } mutex_init(&sbi->s_lock); sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; sbi->cluster_bits = ffs(sbi->cluster_size) - 1; - sbi->fats = b->fats; + sbi->fats = bpb.fat_fats; sbi->fat_bits = 0; /* Don't know yet */ - sbi->fat_start = le16_to_cpu(b->reserved); - sbi->fat_length = le16_to_cpu(b->fat_length); + sbi->fat_start = bpb.fat_reserved; + sbi->fat_length = bpb.fat_fat_length; sbi->root_cluster = 0; sbi->free_clusters = -1; /* Don't know yet */ sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; sb->s_maxbytes = 0xffffffff; - if (!sbi->fat_length && b->fat32.length) { + if (!sbi->fat_length && bpb.fat32_length) { struct fat_boot_fsinfo *fsinfo; struct buffer_head *fsinfo_bh; /* Must be FAT32 */ sbi->fat_bits = 32; - sbi->fat_length = le32_to_cpu(b->fat32.length); - sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster); + sbi->fat_length = bpb.fat32_length; + sbi->root_cluster = bpb.fat32_root_cluster; /* MC - if info_sector is 0, don't multiply by 0 */ - sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector); + sbi->fsinfo_sector = bpb.fat32_info_sector; if (sbi->fsinfo_sector == 0) sbi->fsinfo_sector = 1; @@ -1399,7 +1601,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (fsinfo_bh == NULL) { fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" " (sector = %lu)", sbi->fsinfo_sector); - brelse(bh); goto out_fail; } @@ -1422,35 +1623,28 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, /* interpret volume ID as a little endian 32 bit integer */ if (sbi->fat_bits == 32) - sbi->vol_id = (((u32)b->fat32.vol_id[0]) | - ((u32)b->fat32.vol_id[1] << 8) | - ((u32)b->fat32.vol_id[2] << 16) | - ((u32)b->fat32.vol_id[3] << 24)); + sbi->vol_id = bpb.fat32_vol_id; else /* fat 16 or 12 */ - sbi->vol_id = (((u32)b->fat16.vol_id[0]) | - ((u32)b->fat16.vol_id[1] << 8) | - ((u32)b->fat16.vol_id[2] << 16) | - ((u32)b->fat16.vol_id[3] << 24)); + sbi->vol_id = bpb.fat16_vol_id; sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; - sbi->dir_entries = get_unaligned_le16(&b->dir_entries); + sbi->dir_entries = bpb.fat_dir_entries; if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus directory-entries per block" " (%u)", sbi->dir_entries); - brelse(bh); goto out_invalid; } rootdir_sectors = sbi->dir_entries * sizeof(struct msdos_dir_entry) / sb->s_blocksize; sbi->data_start = sbi->dir_start + rootdir_sectors; - total_sectors = get_unaligned_le16(&b->sectors); + total_sectors = bpb.fat_sectors; if (total_sectors == 0) - total_sectors = le32_to_cpu(b->total_sect); + total_sectors = bpb.fat_total_sect; total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; @@ -1459,9 +1653,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ if (sbi->fat_bits == 32) - sbi->dirty = b->fat32.state & FAT_STATE_DIRTY; + sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY; else /* fat 16 or 12 */ - sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; + sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ fat_clusters = calc_fat_clusters(sb); @@ -1470,7 +1664,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (!silent) fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", total_clusters); - brelse(bh); goto out_invalid; } @@ -1483,8 +1676,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, if (sbi->prev_free < FAT_START_ENT) sbi->prev_free = FAT_START_ENT; - brelse(bh); - /* set up enough so that it can read an inode */ fat_hash_init(sb); dir_hash_init(sb); diff --git a/fs/fcntl.c b/fs/fcntl.c index 9ead1596399a..72c82f69b01b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -274,15 +274,15 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_GETLKP: + case F_OFD_GETLK: #endif case F_GETLK: err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: #endif /* Fallthrough */ case F_SETLK: @@ -399,13 +399,13 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: - case F_GETLKP: + case F_OFD_GETLK: err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); break; case F_SETLK64: case F_SETLKW64: - case F_SETLKP: - case F_SETLKPW: + case F_OFD_SETLK: + case F_OFD_SETLKW: err = fcntl_setlk64(fd, f.file, cmd, (struct flock64 __user *) arg); break; diff --git a/fs/file.c b/fs/file.c index 8f294cfac697..66923fe3176e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -44,15 +44,10 @@ static void *alloc_fdmem(size_t size) return vmalloc(size); } -static void free_fdmem(void *ptr) -{ - is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr); -} - static void __free_fdtable(struct fdtable *fdt) { - free_fdmem(fdt->fd); - free_fdmem(fdt->open_fds); + kvfree(fdt->fd); + kvfree(fdt->open_fds); kfree(fdt); } @@ -130,7 +125,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) return fdt; out_arr: - free_fdmem(fdt->fd); + kvfree(fdt->fd); out_fdt: kfree(fdt); out: diff --git a/fs/file_table.c b/fs/file_table.c index a374f5033e97..385bfd31512a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -76,14 +76,14 @@ EXPORT_SYMBOL_GPL(get_max_files); * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; @@ -175,6 +175,12 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; + if ((mode & FMODE_READ) && + likely(fop->read || fop->aio_read || fop->read_iter)) + mode |= FMODE_CAN_READ; + if ((mode & FMODE_WRITE) && + likely(fop->write || fop->aio_write || fop->write_iter)) + mode |= FMODE_CAN_WRITE; file->f_mode = mode; file->f_op = fop; if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f7cff367db7f..56cce7fdd39e 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -280,15 +280,15 @@ int fscache_add_cache(struct fscache_cache *cache, spin_unlock(&fscache_fsdef_index.lock); up_write(&fscache_addremove_sem); - printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", - cache->tag->name, cache->ops->name); + pr_notice("Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); kobject_uevent(cache->kobj, KOBJ_ADD); _leave(" = 0 [%s]", cache->identifier); return 0; tag_in_use: - printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); + pr_err("Cache tag '%s' already in use\n", tagname); __fscache_release_cache_tag(tag); _leave(" = -EXIST"); return -EEXIST; @@ -317,8 +317,7 @@ EXPORT_SYMBOL(fscache_add_cache); void fscache_io_error(struct fscache_cache *cache) { if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) - printk(KERN_ERR "FS-Cache:" - " Cache '%s' stopped due to I/O error\n", + pr_err("Cache '%s' stopped due to I/O error\n", cache->ops->name); } EXPORT_SYMBOL(fscache_io_error); @@ -369,8 +368,8 @@ void fscache_withdraw_cache(struct fscache_cache *cache) _enter(""); - printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", - cache->tag->name); + pr_notice("Withdrawing cache \"%s\"\n", + cache->tag->name); /* make the cache unavailable for cookie acquisition */ if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 29d7feb62cf7..aec01be91b0a 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -519,7 +519,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) ASSERTCMP(atomic_read(&cookie->n_active), >, 0); if (atomic_read(&cookie->n_children) != 0) { - printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + pr_err("Cookie '%s' still has children\n", cookie->def->name); BUG(); } diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index bad496748a59..7d637e2335fd 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -31,12 +31,10 @@ static int fscache_histogram_show(struct seq_file *m, void *v) switch ((unsigned long) v) { case 1: - seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " - " RETRV DLY RETRIEVLS\n"); + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n"); return 0; case 2: - seq_puts(m, "===== ===== ========= ========= =========" - " ========= =========\n"); + seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n"); return 0; default: index = (unsigned long) v - 3; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 4226f6680b06..bc6c08fcfddd 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -22,6 +22,12 @@ * */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "FS-Cache: " fmt + #include <linux/fscache-cache.h> #include <linux/sched.h> @@ -413,8 +419,8 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -422,9 +428,9 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ @@ -433,8 +439,8 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -442,9 +448,9 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 7c27907e650c..63f868e869b9 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -ctl_table fscache_sysctls[] = { +struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ ctl_table fscache_sysctls[] = { {} }; -ctl_table fscache_sysctls_root[] = { +struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, @@ -146,8 +146,7 @@ static int __init fscache_init(void) 0, fscache_cookie_init_once); if (!fscache_cookie_jar) { - printk(KERN_NOTICE - "FS-Cache: Failed to allocate a cookie jar\n"); + pr_notice("Failed to allocate a cookie jar\n"); ret = -ENOMEM; goto error_cookie_jar; } @@ -156,7 +155,7 @@ static int __init fscache_init(void) if (!fscache_root) goto error_kobj; - printk(KERN_NOTICE "FS-Cache: Loaded\n"); + pr_notice("Loaded\n"); return 0; error_kobj: @@ -192,7 +191,7 @@ static void __exit fscache_exit(void) fscache_proc_cleanup(); destroy_workqueue(fscache_op_wq); destroy_workqueue(fscache_object_wq); - printk(KERN_NOTICE "FS-Cache: Unloaded\n"); + pr_notice("Unloaded\n"); } module_exit(fscache_exit); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 989f39401547..6d941f56faf4 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -65,8 +65,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) list_add(&netfs->link, &fscache_netfs_list); ret = 0; - printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", - netfs->name); + pr_notice("Netfs '%s' registered for caching\n", netfs->name); already_registered: up_write(&fscache_addremove_sem); @@ -97,8 +96,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs) up_write(&fscache_addremove_sem); - printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", - netfs->name); + pr_notice("Netfs '%s' unregistered from caching\n", + netfs->name); _leave(""); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index b5ebc2d7d80d..b8179ca6bf9d 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -285,20 +285,20 @@ static int fscache_objlist_show(struct seq_file *m, void *v) fscache_unuse_cookie(obj); if (keylen > 0 || auxlen > 0) { - seq_printf(m, " "); + seq_puts(m, " "); for (p = buf; keylen > 0; keylen--) seq_printf(m, "%02x", *p++); if (auxlen > 0) { if (config & FSCACHE_OBJLIST_CONFIG_KEY) - seq_printf(m, ", "); + seq_puts(m, ", "); for (; auxlen > 0; auxlen--) seq_printf(m, "%02x", *p++); } } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } else { - seq_printf(m, "<no_netfs>\n"); + seq_puts(m, "<no_netfs>\n"); } return 0; } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 318071aca217..e7b87a0e5185 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -51,8 +51,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) _debug("queue for caller's attention"); break; default: - printk(KERN_ERR "FS-Cache: Unexpected op type %lx", - op->flags); + pr_err("Unexpected op type %lx", op->flags); BUG(); break; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 7f5c658af755..ed70714503fa 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1108,10 +1108,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) static bool once_only; if (!once_only) { once_only = true; - printk(KERN_WARNING "FS-Cache:" - " Cookie type %s marked page %lx" - " multiple times\n", - cookie->def->name, page->index); + pr_warn("Cookie type %s marked page %lx multiple times\n", + cookie->def->name, page->index); } } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a0b0855d00a9..205e0d5d5307 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -348,7 +348,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 13b691a8a7d2..966ace8b243f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -94,8 +94,10 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, loff_t pos = 0; struct iovec iov = { .iov_base = buf, .iov_len = count }; struct fuse_io_priv io = { .async = 0, .file = file }; + struct iov_iter ii; + iov_iter_init(&ii, READ, &iov, 1, count); - return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE); + return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE); } static ssize_t cuse_write(struct file *file, const char __user *buf, @@ -104,12 +106,14 @@ static ssize_t cuse_write(struct file *file, const char __user *buf, loff_t pos = 0; struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; struct fuse_io_priv io = { .async = 0, .file = file }; + struct iov_iter ii; + iov_iter_init(&ii, WRITE, &iov, 1, count); /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(&io, &iov, 1, count, &pos, + return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_WRITE | FUSE_DIO_CUSE); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aac71ce373e4..ca887314aba9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -643,9 +643,8 @@ struct fuse_copy_state { unsigned long seglen; unsigned long addr; struct page *pg; - void *mapaddr; - void *buf; unsigned len; + unsigned offset; unsigned move_pages:1; }; @@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; - if (!cs->write) { - kunmap_atomic(cs->mapaddr); - } else { - kunmap_atomic(cs->mapaddr); + if (cs->write) buf->len = PAGE_SIZE - cs->len; - } cs->currbuf = NULL; - cs->mapaddr = NULL; - } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr); + } else if (cs->pg) { if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } put_page(cs->pg); - cs->mapaddr = NULL; } + cs->pg = NULL; } /* @@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) */ static int fuse_copy_fill(struct fuse_copy_state *cs) { - unsigned long offset; + struct page *page; int err; unlock_request(cs->fc, cs->req); @@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = kmap_atomic(buf->page); + cs->pg = buf->page; + cs->offset = buf->offset; cs->len = buf->len; - cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; cs->nr_segs--; } else { - struct page *page; - if (cs->nr_segs == cs->pipe->buffers) return -EIO; @@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page); - cs->buf = cs->mapaddr; + cs->pg = page; + cs->offset = 0; cs->len = PAGE_SIZE; cs->pipebufs++; cs->nr_segs++; @@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->iov++; cs->nr_segs--; } - err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + err = get_user_pages_fast(cs->addr, 1, cs->write, &page); if (err < 0) return err; BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->pg = page; + cs->offset = cs->addr % PAGE_SIZE; + cs->len = min(PAGE_SIZE - cs->offset, cs->seglen); cs->seglen -= cs->len; cs->addr += cs->len; } @@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { + void *pgaddr = kmap_atomic(cs->pg); + void *buf = pgaddr + cs->offset; + if (cs->write) - memcpy(cs->buf, *val, ncpy); + memcpy(buf, *val, ncpy); else - memcpy(*val, cs->buf, ncpy); + memcpy(*val, buf, ncpy); + + kunmap_atomic(pgaddr); *val += ncpy; } *size -= ncpy; cs->len -= ncpy; - cs->buf += ncpy; + cs->offset += ncpy; return ncpy; } @@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = kmap_atomic(buf->page); - cs->buf = cs->mapaddr + buf->offset; + cs->pg = buf->page; + cs->offset = buf->offset; err = lock_request(cs->fc, cs->req); if (err) @@ -1614,7 +1609,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, 0); + release_pages(req->pages, req->num_pages, false); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5b4e035b364c..0c6048247a34 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -198,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) goto invalid; - else if (fuse_dentry_time(entry) < get_jiffies_64()) { + else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + (flags & LOOKUP_REVAL)) { int err; struct fuse_entry_out outarg; struct fuse_req *req; @@ -679,6 +680,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -713,6 +722,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -743,23 +753,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -771,15 +784,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); + + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -795,6 +815,42 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags) { + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, + sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + } else { + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, + sizeof(struct fuse_rename_in)); + } + + return err; +} + +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename2(olddir, oldent, newdir, newent, 0); +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -829,6 +885,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -846,6 +903,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; } stat->dev = inode->i_sb->s_dev; @@ -933,7 +992,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, int err; bool r; - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { r = true; err = fuse_do_getattr(inode, stat, file); } else { @@ -1119,7 +1178,7 @@ static int fuse_permission(struct inode *inode, int mask) ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); - if (fi->i_time < get_jiffies_64()) { + if (time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); @@ -1504,7 +1563,7 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) } static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_mtime) + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; @@ -1523,13 +1582,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } } /* @@ -1597,39 +1661,38 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, /* * Flush inode->i_mtime to the server */ -int fuse_flush_mtime(struct file *file, bool nofail) +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { - struct inode *inode = file->f_mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = NULL; + struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; int err; - if (nofail) { - req = fuse_get_req_nofail_nopages(fc, file); - } else { - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - } + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - inarg.valid |= FATTR_MTIME; + inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); - return err; } @@ -1653,7 +1716,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, bool is_wb = fc->writeback_cache; loff_t oldsize; int err; - bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1678,11 +1741,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_mtime); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1711,9 +1776,12 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, spin_lock(&fc->lock); /* the kernel maintains i_mtime locally */ - if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { - inode->i_mtime = attr->ia_mtime; - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, @@ -1810,8 +1878,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1941,20 +2011,11 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); - return err; -} - -static int fuse_update_time(struct inode *inode, struct timespec *now, - int flags) -{ - if (flags & S_MTIME) { - inode->i_mtime = *now; - set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state); - BUG_ON(!S_ISREG(inode->i_mode)); + fuse_update_ctime(inode); } - return 0; + return err; } static const struct inode_operations fuse_dir_inode_operations = { @@ -1964,6 +2025,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, @@ -1996,7 +2058,6 @@ static const struct inode_operations fuse_common_inode_operations = { .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, .removexattr = fuse_removexattr, - .update_time = fuse_update_time, }; static const struct inode_operations fuse_symlink_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 13f8bdec5110..40ac2628ddcf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -223,6 +223,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); @@ -232,18 +234,26 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; err = generic_file_open(inode, file); if (err) return err; + if (lock_inode) + mutex_lock(&inode->i_mutex); + err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; - fuse_finish_open(inode, file); + if (!err) + fuse_finish_open(inode, file); - return 0; + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; } static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) @@ -314,10 +324,7 @@ static int fuse_release(struct inode *inode, struct file *file) /* see fuse_vma_close() for !writeback_cache case */ if (fc->writeback_cache) - filemap_write_and_wait(file->f_mapping); - - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) - fuse_flush_mtime(file, true); + write_inode_now(inode, 1); fuse_release_common(file, FUSE_RELEASE); @@ -439,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - err = filemap_write_and_wait(file->f_mapping); + err = write_inode_now(inode, 1); if (err) return err; @@ -480,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) - return 0; - mutex_lock(&inode->i_mutex); /* @@ -494,17 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = write_inode_now(inode, 0); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) goto out; fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { - int err = fuse_flush_mtime(file, false); - if (err) - goto out; - } + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { @@ -933,8 +933,7 @@ out: return err; } -static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); @@ -945,14 +944,14 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, * i_size is up to date). */ if (fc->auto_inval_data || - (pos + iov_length(iov, nr_segs) > i_size_read(inode))) { + (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); if (err) return err; } - return generic_file_aio_read(iocb, iov, nr_segs, pos); + return generic_file_read_iter(iocb, to); } static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, @@ -1089,8 +1088,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); flush_dcache_page(page); - mark_page_accessed(page); - if (!tmp) { unlock_page(page); page_cache_release(page); @@ -1183,19 +1180,17 @@ static ssize_t fuse_perform_write(struct file *file, return res > 0 ? res : err; } -static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - size_t count = 0; - size_t ocount = 0; + size_t count = iov_iter_count(from); ssize_t written = 0; ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; - struct iov_iter i; loff_t endbyte = 0; + loff_t pos = iocb->ki_pos; if (get_fuse_conn(inode)->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1203,17 +1198,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) return err; - return generic_file_aio_write(iocb, iov, nr_segs, pos); + return generic_file_write_iter(iocb, from); } - WARN_ON(iocb->ki_pos != pos); - - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; - - count = ocount; mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -1226,6 +1213,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; + iov_iter_truncate(from, count); err = file_remove_suid(file); if (err) goto out; @@ -1235,16 +1223,13 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - count, ocount); - if (written < 0 || written == count) + written = generic_file_direct_write(iocb, from, pos); + if (written < 0 || !iov_iter_count(from)) goto out; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = fuse_perform_write(file, mapping, &i, pos); + written_buffered = fuse_perform_write(file, mapping, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1263,8 +1248,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, written += written_buffered; iocb->ki_pos = pos + written_buffered; } else { - iov_iter_init(&i, iov, nr_segs, count, 0); - written = fuse_perform_write(file, mapping, &i, pos); + written = fuse_perform_write(file, mapping, from, pos); if (written >= 0) iocb->ki_pos = pos + written; } @@ -1302,7 +1286,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t nbytes = 0; /* # bytes already packed in req */ /* Special case for kernel I/O: can copy directly into the buffer */ - if (segment_eq(get_fs(), KERNEL_DS)) { + if (ii->type & ITER_KVEC) { unsigned long user_addr = fuse_get_user_addr(ii); size_t frag_size = fuse_get_frag_size(ii, *nbytesp); @@ -1318,35 +1302,26 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, while (nbytes < *nbytesp && req->num_pages < req->max_pages) { unsigned npages; - unsigned long user_addr = fuse_get_user_addr(ii); - unsigned offset = user_addr & ~PAGE_MASK; - size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); - int ret; - + size_t start; unsigned n = req->max_pages - req->num_pages; - frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); - - npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = clamp(npages, 1U, n); - - ret = get_user_pages_fast(user_addr, npages, !write, - &req->pages[req->num_pages]); + ssize_t ret = iov_iter_get_pages(ii, + &req->pages[req->num_pages], + n * PAGE_SIZE, &start); if (ret < 0) return ret; - npages = ret; - frag_size = min_t(size_t, frag_size, - (npages << PAGE_SHIFT) - offset); - iov_iter_advance(ii, frag_size); + iov_iter_advance(ii, ret); + nbytes += ret; + + ret += start; + npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; - req->page_descs[req->num_pages].offset = offset; + req->page_descs[req->num_pages].offset = start; fuse_page_descs_length_init(req, req->num_pages, npages); req->num_pages += npages; req->page_descs[req->num_pages - 1].length -= - (npages << PAGE_SHIFT) - offset - frag_size; - - nbytes += frag_size; + (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } if (write) @@ -1361,24 +1336,11 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, static inline int fuse_iter_npages(const struct iov_iter *ii_p) { - struct iov_iter ii = *ii_p; - int npages = 0; - - while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { - unsigned long user_addr = fuse_get_user_addr(&ii); - unsigned offset = user_addr & ~PAGE_MASK; - size_t frag_size = iov_iter_single_seg_count(&ii); - - npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - iov_iter_advance(&ii, frag_size); - } - - return min(npages, FUSE_MAX_PAGES_PER_REQ); + return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); } -ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, - unsigned long nr_segs, size_t count, loff_t *ppos, - int flags) +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; @@ -1388,18 +1350,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, struct fuse_conn *fc = ff->fc; size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; + size_t count = iov_iter_count(iter); pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; ssize_t res = 0; struct fuse_req *req; - struct iov_iter ii; - - iov_iter_init(&ii, iov, nr_segs, count, 0); if (io->async) - req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); + req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); else - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + req = fuse_get_req(fc, fuse_iter_npages(iter)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1415,7 +1375,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, size_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, &ii, &nbytes, write); + int err = fuse_get_user_pages(req, iter, &nbytes, write); if (err) { res = err; break; @@ -1445,9 +1405,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, fuse_put_request(fc, req); if (io->async) req = fuse_get_req_for_background(fc, - fuse_iter_npages(&ii)); + fuse_iter_npages(iter)); else - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + req = fuse_get_req(fc, fuse_iter_npages(iter)); if (IS_ERR(req)) break; } @@ -1462,9 +1422,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, EXPORT_SYMBOL_GPL(fuse_direct_io); static ssize_t __fuse_direct_read(struct fuse_io_priv *io, - const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, - size_t count) + struct iov_iter *iter, + loff_t *ppos) { ssize_t res; struct file *file = io->file; @@ -1473,7 +1432,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0); + res = fuse_direct_io(io, iter, ppos, 0); fuse_invalidate_attr(inode); @@ -1485,22 +1444,26 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, { struct fuse_io_priv io = { .async = 0, .file = file }; struct iovec iov = { .iov_base = buf, .iov_len = count }; - return __fuse_direct_read(&io, &iov, 1, ppos, count); + struct iov_iter ii; + iov_iter_init(&ii, READ, &iov, 1, count); + return __fuse_direct_read(&io, &ii, ppos); } static ssize_t __fuse_direct_write(struct fuse_io_priv *io, - const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) + struct iov_iter *iter, + loff_t *ppos) { struct file *file = io->file; struct inode *inode = file_inode(file); - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); ssize_t res; + res = generic_write_checks(file, ppos, &count, 0); - if (!res) - res = fuse_direct_io(io, iov, nr_segs, count, ppos, - FUSE_DIO_WRITE); + if (!res) { + iov_iter_truncate(iter, count); + res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE); + } fuse_invalidate_attr(inode); @@ -1514,13 +1477,15 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, struct inode *inode = file_inode(file); ssize_t res; struct fuse_io_priv io = { .async = 0, .file = file }; + struct iov_iter ii; + iov_iter_init(&ii, WRITE, &iov, 1, count); if (is_bad_inode(inode)) return -EIO; /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(&io, &iov, 1, ppos); + res = __fuse_direct_write(&io, &ii, ppos); if (res > 0) fuse_write_update_size(inode, *ppos); mutex_unlock(&inode->i_mutex); @@ -1659,13 +1624,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_writepage_free(fc, req); } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) { struct fuse_file *ff = NULL; spin_lock(&fc->lock); - if (!WARN_ON(list_empty(&fi->write_files))) { + if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); @@ -1675,6 +1640,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, return ff; } +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_times(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1699,7 +1687,7 @@ static int fuse_writepage_locked(struct page *page) error = -EIO; req->ff = fuse_write_file_get(fc, fi); if (!req->ff) - goto err_free; + goto err_nofile; fuse_write_fill(req, req->ff, page_offset(page), 0); @@ -1727,6 +1715,8 @@ static int fuse_writepage_locked(struct page *page) return 0; +err_nofile: + __free_page(tmp_page); err_free: fuse_request_free(req); err: @@ -1967,8 +1957,8 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kzalloc(sizeof(struct page *) * - FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) goto out; @@ -2281,7 +2271,6 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) struct fuse_file *ff = file->private_data; /* emulate flock with POSIX locks */ - fl->fl_owner = (fl_owner_t) file; ff->flock = true; err = fuse_setlk(file, fl, 1); } @@ -2352,7 +2341,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, if (!bytes) return 0; - iov_iter_init(&ii, iov, nr_segs, bytes, 0); + iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); while (iov_iter_count(&ii)) { struct page *page = pages[page_idx++]; @@ -2874,8 +2863,8 @@ static inline loff_t fuse_round_up(loff_t off) } static ssize_t -fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { ssize_t ret = 0; struct file *file = iocb->ki_filp; @@ -2884,7 +2873,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos = 0; struct inode *inode; loff_t i_size; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); struct fuse_io_priv *io; pos = offset; @@ -2899,6 +2888,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, if (offset >= i_size) return 0; count = min_t(loff_t, count, fuse_round_up(i_size - offset)); + iov_iter_truncate(iter, count); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); @@ -2928,9 +2918,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, io->async = false; if (rw == WRITE) - ret = __fuse_direct_write(io, iov, nr_segs, &pos); + ret = __fuse_direct_write(io, iter, &pos); else - ret = __fuse_direct_read(io, iov, nr_segs, &pos, count); + ret = __fuse_direct_read(io, iter, &pos); if (io->async) { fuse_aio_complete(io, ret < 0 ? ret : 0, -1); @@ -2972,6 +2962,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (fc->no_fallocate) return -EOPNOTSUPP; @@ -3017,12 +3010,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) { - struct fuse_inode *fi = get_fuse_inode(inode); - - inode->i_mtime = current_fs_time(inode->i_sb); - set_bit(FUSE_I_MTIME_DIRTY, &fi->state); - } + if (changed && fc->writeback_cache) + file_update_time(file); } if (mode & FALLOC_FL_PUNCH_HOLE) @@ -3042,10 +3031,10 @@ out: static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, - .read = do_sync_read, - .aio_read = fuse_file_aio_read, - .write = do_sync_write, - .aio_write = fuse_file_aio_write, + .read = new_sync_read, + .read_iter = fuse_file_read_iter, + .write = new_sync_write, + .write_iter = fuse_file_write_iter, .mmap = fuse_file_mmap, .open = fuse_open, .flush = fuse_flush, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a257ed8ebee6..e8e47a6ab518 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,8 +119,6 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, - /** i_mtime has been updated locally; a flush to userspace needed */ - FUSE_I_MTIME_DIRTY, }; struct fuse_conn; @@ -544,6 +542,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; @@ -725,7 +726,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void); /** * Allocate a request @@ -879,9 +880,8 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, /** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ #define FUSE_DIO_CUSE (1 << 1) -ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, - unsigned long nr_segs, size_t count, loff_t *ppos, - int flags); +ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, + loff_t *ppos, int flags); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, @@ -891,7 +891,8 @@ int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); -int fuse_flush_mtime(struct file *file, bool nofail); +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8d611696fcad..03246cd9d47a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,9 +175,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; } - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -256,6 +256,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -303,7 +305,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if ((inode->i_state & I_NEW)) { inode->i_flags |= S_NOATIME; - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; @@ -476,6 +478,17 @@ static const match_table_t tokens = { {OPT_ERR, NULL} }; +static int fuse_match_uint(substring_t *s, unsigned int *res) +{ + int err = -ENOMEM; + char *buf = match_strdup(s); + if (buf) { + err = kstrtouint(buf, 10, res); + kfree(buf); + } + return err; +} + static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) { char *p; @@ -486,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) while ((p = strsep(&opt, ",")) != NULL) { int token; int value; + unsigned uv; substring_t args[MAX_OPT_ARGS]; if (!*p) continue; @@ -509,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_USER_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), value); + d->user_id = make_kuid(current_user_ns(), uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: - if (match_int(&args[0], &value)) + if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), value); + d->group_id = make_kgid(current_user_ns(), uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; @@ -788,6 +802,7 @@ static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, @@ -890,6 +905,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_dio = 1; if (arg->flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -918,7 +935,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | - FUSE_WRITEBACK_CACHE; + FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -996,9 +1013,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~MS_NOSEC; + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); - if (!parse_fuse_opt((char *) data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev)) goto err; if (is_bdev) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ce62dcac90b6..805b37fed638 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -431,7 +431,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping, ret = gfs2_write_cache_jdata(mapping, wbc); if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { - gfs2_log_flush(sdp, ip->i_gl); + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); ret = gfs2_write_cache_jdata(mapping, wbc); } return ret; @@ -577,7 +577,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, p = kmap_atomic(page); memcpy(buf + copied, p + offset, amt); kunmap_atomic(p); - mark_page_accessed(page); page_cache_release(page); copied += amt; index++; @@ -1041,8 +1040,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1082,7 +1080,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, */ if (mapping->nrpages) { loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); - loff_t len = iov_length(iov, nr_segs); + loff_t len = iov_iter_count(iter); loff_t end = PAGE_ALIGN(offset + len) - 1; rv = 0; @@ -1097,9 +1095,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, truncate_inode_pages_range(mapping, lstart, end); } - rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, gfs2_get_block_direct, - NULL, NULL, 0); + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c62d4b9f51dc..e6ee5b6e8d99 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -707,7 +707,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi * @top: The first pointer in the buffer * @bottom: One more than the last pointer * @height: the height this buffer is at - * @data: a pointer to a struct strip_mine + * @sm: a pointer to a struct strip_mine * * Returns: errno */ @@ -992,6 +992,8 @@ unlock: return err; } +#define GFS2_JTRUNC_REVOKES 8192 + /** * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files * @inode: The inode being truncated @@ -1003,8 +1005,6 @@ unlock: * if the number of pages being truncated gets too large. */ -#define GFS2_JTRUNC_REVOKES 8192 - static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) { struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1348,7 +1348,7 @@ void gfs2_free_journal_extents(struct gfs2_jdesc *jd) * gfs2_add_jextent - Add or merge a new extent to extent cache * @jd: The journal descriptor * @lblock: The logical block at start of new extent - * @pblock: The physical block at start of new extent + * @dblock: The physical block at start of new extent * @blocks: Size of extent in fs blocks * * Returns: 0 on success or -ENOMEM diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 80d67253623c..26b3f952e6b1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -203,9 +203,9 @@ void gfs2_set_inode_flags(struct inode *inode) GFS2_DIF_INHERIT_JDATA) /** - * gfs2_set_flags - set flags on an inode - * @inode: The inode - * @flags: The flags to set + * do_gfs2_set_flags - set flags on an inode + * @filp: file pointer + * @reqflags: The flags to set * @mask: Indicates which flags are valid * */ @@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) } if ((flags ^ new_flags) & GFS2_DIF_JDATA) { if (flags & GFS2_DIF_JDATA) - gfs2_log_flush(sdp, ip->i_gl); + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); error = filemap_fdatawrite(inode->i_mapping); if (error) goto out; @@ -318,7 +318,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /** * gfs2_size_hint - Give a hint to the size of a write request - * @file: The struct file + * @filep: The struct file * @offset: The file offset of the write * @size: The length of the write * @@ -371,7 +371,7 @@ static int gfs2_allocate_page_backing(struct page *page) /** * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable * @vma: The virtual memory area - * @page: The page which is about to become writable + * @vmf: The virtual memory fault containing the page to become writable * * When the page becomes writable, we need to ensure that we have * blocks allocated on disk to back that page. @@ -684,7 +684,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, } /** - * gfs2_file_aio_write - Perform a write to a file + * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context * @iov: The data to write * @nr_segs: Number of @iov segments @@ -697,11 +697,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, * */ -static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - size_t writesize = iov_length(iov, nr_segs); struct gfs2_inode *ip = GFS2_I(file_inode(file)); int ret; @@ -709,7 +707,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret) return ret; - gfs2_size_hint(file, pos, writesize); + gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); if (file->f_flags & O_APPEND) { struct gfs2_holder gh; @@ -720,7 +718,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, gfs2_glock_dq_uninit(&gh); } - return generic_file_aio_write(iocb, iov, nr_segs, pos); + return generic_file_write_iter(iocb, from); } static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, @@ -983,7 +981,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) int error = 0; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; mutex_lock(&fp->f_fl_mutex); @@ -993,7 +991,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) goto out; flock_lock_file_wait(file, &(struct file_lock){.fl_type = F_UNLCK}); - gfs2_glock_dq_wait(fl_gh); + gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, @@ -1058,10 +1056,10 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = gfs2_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, @@ -1070,7 +1068,7 @@ const struct file_operations gfs2_file_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .setlease = gfs2_setlease, .fallocate = gfs2_fallocate, }; @@ -1090,17 +1088,17 @@ const struct file_operations gfs2_dir_fops = { const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = gfs2_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index aec7f73832f0..ee4e04fe60fc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -277,7 +277,7 @@ static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holde static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); } @@ -411,7 +411,7 @@ static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } @@ -620,7 +620,7 @@ out: out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); gl->gl_lockref.count++; if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gl->gl_lockref.count--; @@ -628,7 +628,7 @@ out_sched: out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); return; } @@ -731,14 +731,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, cachep = gfs2_glock_aspace_cachep; else cachep = gfs2_glock_cachep; - gl = kmem_cache_alloc(cachep, GFP_KERNEL); + gl = kmem_cache_alloc(cachep, GFP_NOFS); if (!gl) return -ENOMEM; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); if (glops->go_flags & GLOF_LVB) { - gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); + gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; @@ -1404,12 +1404,16 @@ __acquires(&lru_lock) gl = list_entry(list->next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); if (!spin_trylock(&gl->gl_spin)) { +add_back_to_lru: list_add(&gl->gl_lru, &lru_list); atomic_inc(&lru_count); continue; } + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + spin_unlock(&gl->gl_spin); + goto add_back_to_lru; + } clear_bit(GLF_LRU, &gl->gl_flags); - spin_unlock(&lru_lock); gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1417,7 +1421,7 @@ __acquires(&lru_lock) if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) gl->gl_lockref.count--; spin_unlock(&gl->gl_spin); - spin_lock(&lru_lock); + cond_resched_lock(&lru_lock); } } @@ -1442,7 +1446,7 @@ static long gfs2_scan_glock_lru(int nr) gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); /* Test for being demotable */ - if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54b66809e818..2ffc67dce87f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -89,18 +89,23 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) if (!tr.tr_revokes) return; - /* A shortened, inline version of gfs2_trans_begin() */ + /* A shortened, inline version of gfs2_trans_begin() + * tr->alloced is not set since the transaction structure is + * on the stack */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); tr.tr_ip = (unsigned long)__builtin_return_address(0); sb_start_intwrite(sdp->sd_vfs); - gfs2_log_reserve(sdp, tr.tr_reserved); + if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { + sb_end_intwrite(sdp->sd_vfs); + return; + } WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) @@ -121,7 +126,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) return; __gfs2_ail_flush(gl, fsync, max_revokes); gfs2_trans_end(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } /** @@ -144,7 +149,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(sdp, gl); + gfs2_log_flush(sdp, gl, NORMAL_FLUSH); filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); mapping_set_error(mapping, error); @@ -206,7 +211,7 @@ static void inode_go_sync(struct gfs2_glock *gl) GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl); + gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH); filemap_fdatawrite(metamapping); if (ip) { struct address_space *mapping = ip->i_inode.i_mapping; @@ -221,7 +226,7 @@ static void inode_go_sync(struct gfs2_glock *gl) * Writeback of the data mapping may cause the dirty flag to be set * so we have to clear it again here. */ - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(GLF_DIRTY, &gl->gl_flags); } @@ -229,8 +234,8 @@ static void inode_go_sync(struct gfs2_glock *gl) * inode_go_inval - prepare a inode glock to be released * @gl: the glock * @flags: - * - * Normally we invlidate everything, but if we are moving into + * + * Normally we invalidate everything, but if we are moving into * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we * can keep hold of the metadata, since it won't have changed. * @@ -253,7 +258,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) } if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) { - gfs2_log_flush(gl->gl_sbd, NULL); + gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH); gl->gl_sbd->sd_rindex_uptodate = 0; } if (ip && S_ISREG(ip->i_inode.i_mode)) @@ -455,31 +460,39 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) } /** - * trans_go_sync - promote/demote the transaction glock + * freeze_go_sync - promote/demote the freeze glock * @gl: the glock * @state: the requested state * @flags: * */ -static void trans_go_sync(struct gfs2_glock *gl) +static void freeze_go_sync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; + DEFINE_WAIT(wait); - if (gl->gl_state != LM_ST_UNLOCKED && + if (gl->gl_state == LM_ST_SHARED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); + atomic_set(&sdp->sd_log_freeze, 1); + wake_up(&sdp->sd_logd_waitq); + do { + prepare_to_wait(&sdp->sd_log_frozen_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_freeze)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_freeze)); + finish_wait(&sdp->sd_log_frozen_wait, &wait); } } /** - * trans_go_xmote_bh - After promoting/demoting the transaction glock + * freeze_go_xmote_bh - After promoting/demoting the freeze glock * @gl: the glock * */ -static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) +static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) { struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); @@ -512,7 +525,7 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) * Always returns 0 */ -static int trans_go_demote_ok(const struct gfs2_glock *gl) +static int freeze_go_demote_ok(const struct gfs2_glock *gl) { return 0; } @@ -563,10 +576,10 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_flags = GLOF_LVB, }; -const struct gfs2_glock_operations gfs2_trans_glops = { - .go_sync = trans_go_sync, - .go_xmote_bh = trans_go_xmote_bh, - .go_demote_ok = trans_go_demote_ok, +const struct gfs2_glock_operations gfs2_freeze_glops = { + .go_sync = freeze_go_sync, + .go_xmote_bh = freeze_go_xmote_bh, + .go_demote_ok = freeze_go_demote_ok, .go_type = LM_TYPE_NONDISK, }; diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index bf95a2dc1662..7455d2629bcb 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -15,7 +15,7 @@ extern const struct gfs2_glock_operations gfs2_meta_glops; extern const struct gfs2_glock_operations gfs2_inode_glops; extern const struct gfs2_glock_operations gfs2_rgrp_glops; -extern const struct gfs2_glock_operations gfs2_trans_glops; +extern const struct gfs2_glock_operations gfs2_freeze_glops; extern const struct gfs2_glock_operations gfs2_iopen_glops; extern const struct gfs2_glock_operations gfs2_flock_glops; extern const struct gfs2_glock_operations gfs2_nondisk_glops; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index bdf70c18610c..67d310c9ada3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -465,9 +465,7 @@ struct gfs2_trans { unsigned int tr_reserved; unsigned int tr_touched:1; unsigned int tr_attached:1; - - struct gfs2_holder tr_t_gh; - + unsigned int tr_alloced:1; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; @@ -682,7 +680,7 @@ struct gfs2_sbd { struct lm_lockstruct sd_lockstruct; struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; - struct gfs2_glock *sd_trans_gl; + struct gfs2_glock *sd_freeze_gl; wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; @@ -730,6 +728,8 @@ struct gfs2_sbd { struct gfs2_holder sd_sc_gh; struct gfs2_holder sd_qc_gh; + struct completion sd_journal_ready; + /* Daemon stuff */ struct task_struct *sd_logd_process; @@ -794,6 +794,12 @@ struct gfs2_sbd { /* For quiescing the filesystem */ struct gfs2_holder sd_freeze_gh; + struct gfs2_holder sd_freeze_root_gh; + struct gfs2_holder sd_thaw_gh; + atomic_t sd_log_freeze; + atomic_t sd_frozen_root; + wait_queue_head_t sd_frozen_root_wait; + wait_queue_head_t sd_log_frozen_wait; char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 28cc7bf6575a..e62e59477884 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1613,18 +1613,26 @@ int gfs2_permission(struct inode *inode, int mask) { struct gfs2_inode *ip; struct gfs2_holder i_gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; int unlock = 0; + int frozen_root = 0; ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - unlock = 1; + if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && + inode == sdp->sd_root_dir->d_inode && + atomic_inc_not_zero(&sdp->sd_frozen_root))) + frozen_root = 1; + else { + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; + } } if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) @@ -1633,6 +1641,8 @@ int gfs2_permission(struct inode *inode, int mask) error = generic_permission(inode, mask); if (unlock) gfs2_glock_dq_uninit(&i_gh); + else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) + wake_up(&sdp->sd_frozen_root_wait); return error; } @@ -1805,19 +1815,29 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; int unlock = 0; + int frozen_root = 0; if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) - return error; - unlock = 1; + if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && + inode == sdp->sd_root_dir->d_inode && + atomic_inc_not_zero(&sdp->sd_frozen_root))) + frozen_root = 1; + else { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; + } } generic_fillattr(inode, stat); if (unlock) gfs2_glock_dq_uninit(&gh); + else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) + wake_up(&sdp->sd_frozen_root_wait); return 0; } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index c1eb555dc588..4fafea1c9ecf 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1036,8 +1036,8 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, new_size = old_size + RECOVER_SIZE_INC; - submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); - result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS); + submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); if (!submit || !result) { kfree(submit); kfree(result); @@ -1134,7 +1134,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); spin_unlock(&ls->ls_recover_spin); } @@ -1271,7 +1271,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); return 0; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 4a14d504ef83..3966fadbcebd 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -301,6 +301,23 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) } /** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + up_read(&sdp->sd_log_flush_lock); +} + +/** * gfs2_log_reserve - Make a log reservation * @sdp: The GFS2 superblock * @blks: The number of blocks to reserve @@ -358,7 +375,10 @@ retry: wake_up(&sdp->sd_log_waitq); down_read(&sdp->sd_log_flush_lock); - + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + gfs2_log_release(sdp, blks); + return -EROFS; + } return 0; } @@ -671,7 +691,8 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) * */ -void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type) { struct gfs2_trans *tr; @@ -723,6 +744,42 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); + + if (atomic_read(&sdp->sd_log_freeze)) + type = FREEZE_FLUSH; + if (type != NORMAL_FLUSH) { + if (!sdp->sd_log_idle) { + for (;;) { + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp)) + break; + } + atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ + trace_gfs2_log_blocks(sdp, -1); + sdp->sd_log_flush_wrapped = 0; + log_write_header(sdp, 0); + sdp->sd_log_head = sdp->sd_log_flush_head; + } + if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH) + gfs2_log_shutdown(sdp); + if (type == FREEZE_FLUSH) { + int error; + + atomic_set(&sdp->sd_log_freeze, 0); + wake_up(&sdp->sd_log_frozen_wait); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, + LM_ST_SHARED, 0, + &sdp->sd_thaw_gh); + if (error) { + printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error); + gfs2_assert_withdraw(sdp, 0); + } + else + gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); + } + } + trace_gfs2_log_flush(sdp, 0); up_write(&sdp->sd_log_flush_lock); @@ -761,7 +818,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) if (sdp->sd_log_tr) { gfs2_merge_trans(sdp->sd_log_tr, tr); } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { - gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl); + gfs2_assert_withdraw(sdp, tr->tr_alloced); sdp->sd_log_tr = tr; tr->tr_attached = 1; } @@ -813,8 +870,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) void gfs2_log_shutdown(struct gfs2_sbd *sdp) { - down_write(&sdp->sd_log_flush_lock); - gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); @@ -824,38 +879,16 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT); - gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_tail = sdp->sd_log_head; - - up_write(&sdp->sd_log_flush_lock); -} - - -/** - * gfs2_meta_syncfs - sync all the buffers in a filesystem - * @sdp: the filesystem - * - */ - -void gfs2_meta_syncfs(struct gfs2_sbd *sdp) -{ - gfs2_log_flush(sdp, NULL); - for (;;) { - gfs2_ail1_start(sdp); - gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp)) - break; - } - gfs2_log_flush(sdp, NULL); } static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) { - return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1) || atomic_read(&sdp->sd_log_freeze)); } static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) @@ -882,14 +915,14 @@ int gfs2_logd(void *data) if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } if (gfs2_ail_flush_reqd(sdp)) { gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp); - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } if (!gfs2_ail_flush_reqd(sdp)) diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 37216634f0aa..9499a6049212 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -63,14 +63,21 @@ extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); +extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); -extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +enum gfs2_flush_type { + NORMAL_FLUSH = 0, + SYNC_FLUSH, + SHUTDOWN_FLUSH, + FREEZE_FLUSH +}; +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + enum gfs2_flush_type type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); -extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_write_revokes(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a294d8d8bcd4..2c1ae861dc94 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -75,7 +75,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; - if (bi->bi_clone == 0) + if (bi->bi_clone == NULL) return; if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 2cf09b63a6b4..b984a6e190bc 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -136,7 +136,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) yield(); } } else { - page = find_lock_page(mapping, index); + page = find_get_page_flags(mapping, index, + FGP_LOCK|FGP_ACCESSED); if (!page) return NULL; } @@ -153,7 +154,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) map_bh(bh, sdp->sd_vfs, blkno); unlock_page(page); - mark_page_accessed(page); page_cache_release(page); return bh; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 22f954051bb8..bc564c0d6d16 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -94,6 +94,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_jindex_list); spin_lock_init(&sdp->sd_jindex_spin); mutex_init(&sdp->sd_jindex_mutex); + init_completion(&sdp->sd_journal_ready); INIT_LIST_HEAD(&sdp->sd_quota_list); mutex_init(&sdp->sd_quota_mutex); @@ -129,6 +130,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_rwsem(&sdp->sd_log_flush_lock); atomic_set(&sdp->sd_log_in_flight, 0); init_waitqueue_head(&sdp->sd_log_flush_wait); + init_waitqueue_head(&sdp->sd_log_frozen_wait); + atomic_set(&sdp->sd_log_freeze, 0); + atomic_set(&sdp->sd_frozen_root, 0); + init_waitqueue_head(&sdp->sd_frozen_root_wait); return sdp; } @@ -419,8 +424,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, goto fail_live; } - error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops, - CREATE, &sdp->sd_trans_gl); + error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops, + CREATE, &sdp->sd_freeze_gl); if (error) { fs_err(sdp, "can't create transaction glock: %d\n", error); goto fail_rename; @@ -429,7 +434,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, return 0; fail_trans: - gfs2_glock_put(sdp->sd_trans_gl); + gfs2_glock_put(sdp->sd_freeze_gl); fail_rename: gfs2_glock_put(sdp->sd_rename_gl); fail_live: @@ -755,7 +760,15 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); gfs2_glock_dq_uninit(&ji_gh); jindex = 0; - + if (!sdp->sd_args.ar_spectator) { + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &sdp->sd_thaw_gh); + if (error) { + fs_err(sdp, "can't acquire freeze glock: %d\n", error); + goto fail_jinode_gh; + } + } + gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); return 0; fail_jinode_gh: @@ -784,6 +797,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail_qinode; error = init_journal(sdp, undo); + complete_all(&sdp->sd_journal_ready); if (error) goto fail; @@ -1200,6 +1214,7 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: + complete_all(&sdp->sd_journal_ready); gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); fail_debug: @@ -1380,7 +1395,7 @@ static void gfs2_kill_sb(struct super_block *sb) return; } - gfs2_meta_syncfs(sdp); + gfs2_log_flush(sdp, NULL, SYNC_FLUSH); dput(sdp->sd_root_dir); dput(sdp->sd_master_dir); sdp->sd_root_dir = NULL; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c4effff7cf55..64b29f7f6b4c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -778,6 +778,7 @@ get_a_page: i_size_write(inode, size); inode->i_mtime = inode->i_atime = CURRENT_TIME; mark_inode_dirty(inode); + set_bit(QDF_REFRESH, &qd->qd_flags); return 0; unlock_out: @@ -879,7 +880,7 @@ out: gfs2_glock_dq_uninit(&ghs[qx]); mutex_unlock(&ip->i_inode.i_mutex); kfree(ghs); - gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH); return error; } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 7ad4094d68c0..94555d4c5698 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -454,7 +454,7 @@ void gfs2_recover_func(struct work_struct *work) struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; - struct gfs2_holder j_gh, ji_gh, t_gh; + struct gfs2_holder j_gh, ji_gh, thaw_gh; unsigned long t; int ro = 0; unsigned int pass; @@ -508,11 +508,11 @@ void gfs2_recover_func(struct work_struct *work) t = jiffies; - /* Acquire a shared hold on the transaction lock */ + /* Acquire a shared hold on the freeze lock */ - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | LM_FLAG_PRIORITY | - GL_NOCACHE, &t_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY, + &thaw_gh); if (error) goto fail_gunlock_ji; @@ -538,7 +538,7 @@ void gfs2_recover_func(struct work_struct *work) fs_warn(sdp, "jid=%u: Can't replay: read-only block " "device\n", jd->jd_jid); error = -EROFS; - goto fail_gunlock_tr; + goto fail_gunlock_thaw; } fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); @@ -549,14 +549,14 @@ void gfs2_recover_func(struct work_struct *work) head.lh_blkno, pass); lops_after_scan(jd, error, pass); if (error) - goto fail_gunlock_tr; + goto fail_gunlock_thaw; } error = clean_journal(jd, &head); if (error) - goto fail_gunlock_tr; + goto fail_gunlock_thaw; - gfs2_glock_dq_uninit(&t_gh); + gfs2_glock_dq_uninit(&thaw_gh); t = DIV_ROUND_UP(jiffies - t, HZ); fs_info(sdp, "jid=%u: Journal replayed in %lus\n", jd->jd_jid, t); @@ -572,8 +572,8 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); goto done; -fail_gunlock_tr: - gfs2_glock_dq_uninit(&t_gh); +fail_gunlock_thaw: + gfs2_glock_dq_uninit(&thaw_gh); fail_gunlock_ji: if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); @@ -587,7 +587,7 @@ fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 281a7716e3f3..f4cb9c0d6bbd 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -337,7 +337,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le /** * gfs2_free_extlen - Return extent length of free blocks - * @rbm: Starting position + * @rrbm: Starting position * @len: Max length to check * * Starting at the block specified by the rbm, see how many free blocks @@ -2001,7 +2001,7 @@ next_rgrp: } /* Flushing the log may release space */ if (loops == 2) - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } return -ENOSPC; @@ -2522,7 +2522,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) /** * gfs2_rlist_free - free a resource group list - * @list: the list of resource groups + * @rlist: the list of resource groups * */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index de8afad89e51..1319b5c4ec68 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -399,7 +399,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; - struct gfs2_holder t_gh; + struct gfs2_holder thaw_gh; struct gfs2_log_header_host head; int error; @@ -407,7 +407,8 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) if (error) return error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &thaw_gh); if (error) goto fail_threads; @@ -433,13 +434,13 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_glock_dq_uninit(&t_gh); + gfs2_glock_dq_uninit(&thaw_gh); return 0; fail: - t_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&t_gh); + thaw_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&thaw_gh); fail_threads: kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); @@ -635,15 +636,21 @@ struct lfcc { */ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, - struct gfs2_holder *t_gh) + struct gfs2_holder *freeze_gh) { struct gfs2_inode *ip; struct gfs2_jdesc *jd; struct lfcc *lfcc; LIST_HEAD(list); struct gfs2_log_header_host lh; + struct gfs2_inode *dip = GFS2_I(sdp->sd_root_dir->d_inode); int error; + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, + &sdp->sd_freeze_root_gh); + if (error) + return error; + atomic_set(&sdp->sd_frozen_root, 1); list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); if (!lfcc) { @@ -659,8 +666,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, list_add(&lfcc->list, &list); } - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, - GL_NOCACHE, t_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, freeze_gh); list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); @@ -676,7 +683,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } if (error) - gfs2_glock_dq_uninit(t_gh); + gfs2_glock_dq_uninit(freeze_gh); out: while (!list_empty(&list)) { @@ -685,6 +692,11 @@ out: gfs2_glock_dq_uninit(&lfcc->gh); kfree(lfcc); } + if (error) { + atomic_dec(&sdp->sd_frozen_root); + wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); + gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); + } return error; } @@ -742,7 +754,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); if (bdi->dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else @@ -822,9 +834,18 @@ out: static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - struct gfs2_holder t_gh; + struct gfs2_holder thaw_gh; int error; + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, + &thaw_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + down_write(&sdp->sd_log_flush_lock); + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + up_write(&sdp->sd_log_flush_lock); + kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); @@ -832,18 +853,11 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); - if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return error; - - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); + gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); + if (thaw_gh.gh_gl) + gfs2_glock_dq_uninit(&thaw_gh); gfs2_quota_cleanup(sdp); @@ -900,7 +914,7 @@ restart: iput(sdp->sd_quota_inode); gfs2_glock_put(sdp->sd_rename_gl); - gfs2_glock_put(sdp->sd_trans_gl); + gfs2_glock_put(sdp->sd_freeze_gl); if (!sdp->sd_args.ar_spectator) { gfs2_glock_dq_uninit(&sdp->sd_journal_gh); @@ -935,8 +949,8 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) - gfs2_log_flush(sdp, NULL); + if (wait && sdp && !atomic_read(&sdp->sd_log_freeze)) + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); return 0; } @@ -986,6 +1000,9 @@ static int gfs2_unfreeze(struct super_block *sb) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + atomic_dec(&sdp->sd_frozen_root); + wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); + gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); return 0; } @@ -1525,7 +1542,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: - gfs2_log_flush(sdp, ip->i_gl); + gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); filemap_fdatawrite(metamapping); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index de25d5577e5d..3ab566ba5696 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -240,8 +240,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len if (gltype > LM_TYPE_JOURNAL) return -EINVAL; - if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK) - glops = &gfs2_trans_glops; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK) + glops = &gfs2_freeze_glops; else glops = gfs2_glops_list[gltype]; if (glops == NULL) @@ -333,7 +333,7 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); else if (val == 0) { clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); gfs2_glock_thaw(sdp); } else { ret = -EINVAL; @@ -407,6 +407,9 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) struct gfs2_jdesc *jd; int rv; + /* Wait for our primary journal to be initialized */ + wait_for_completion(&sdp->sd_journal_ready); + spin_lock(&sdp->sd_jindex_spin); rv = -EBUSY; if (sdp->sd_jdesc->jd_jid == jid) @@ -482,7 +485,7 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) rv = jid = -EINVAL; sdp->sd_lockstruct.ls_jid = jid; clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); out: spin_unlock(&sdp->sd_jindex_spin); diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index bead90d27bad..0546ab4e28e8 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -48,6 +48,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, tr->tr_blocks = blocks; tr->tr_revokes = revokes; tr->tr_reserved = 1; + tr->tr_alloced = 1; if (blocks) tr->tr_reserved += 6 + blocks; if (revokes) @@ -57,48 +58,22 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, INIT_LIST_HEAD(&tr->tr_buf); sb_start_intwrite(sdp->sd_vfs); - gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); - - error = gfs2_glock_nq(&tr->tr_t_gh); - if (error) - goto fail_holder_uninit; error = gfs2_log_reserve(sdp, tr->tr_reserved); if (error) - goto fail_gunlock; + goto fail; current->journal_info = tr; return 0; -fail_gunlock: - gfs2_glock_dq(&tr->tr_t_gh); - -fail_holder_uninit: +fail: sb_end_intwrite(sdp->sd_vfs); - gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); return error; } -/** - * gfs2_log_release - Release a given number of log blocks - * @sdp: The GFS2 superblock - * @blks: The number of blocks - * - */ - -static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) -{ - - atomic_add(blks, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, blks); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= - sdp->sd_jdesc->jd_blocks); - up_read(&sdp->sd_log_flush_lock); -} - static void gfs2_print_trans(const struct gfs2_trans *tr) { pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip); @@ -119,11 +94,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (!tr->tr_touched) { gfs2_log_release(sdp, tr->tr_reserved); - if (tr->tr_t_gh.gh_gl) { - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); + if (tr->tr_alloced) kfree(tr); - } sb_end_intwrite(sdp->sd_vfs); return; } @@ -137,16 +109,12 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_print_trans(tr); gfs2_log_commit(sdp, tr); - if (tr->tr_t_gh.gh_gl) { - gfs2_glock_dq(&tr->tr_t_gh); - gfs2_holder_uninit(&tr->tr_t_gh); - if (!tr->tr_attached) + if (tr->tr_alloced && !tr->tr_attached) kfree(tr); - } up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) - gfs2_log_flush(sdp, NULL); + gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); sb_end_intwrite(sdp->sd_vfs); } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9e2fecd62f62..d0929bc81782 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -125,15 +125,15 @@ static int hfs_releasepage(struct page *page, gfp_t mask) } static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file)->i_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - hfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block); /* * In case of error extending write may have instantiated a few @@ -141,7 +141,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) hfs_write_failed(mapping, end); @@ -674,10 +674,10 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, static const struct file_operations hfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .fsync = hfs_file_fsync, diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index caf89a7be0a1..e5b221de7de6 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -54,14 +54,11 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, memset(key, 0, sizeof(struct hfsplus_attr_key)); key->attr.cnid = cpu_to_be32(cnid); if (name) { - len = strlen(name); - if (len > HFSPLUS_ATTR_MAX_STRLEN) { - pr_err("invalid xattr name's length\n"); - return -EINVAL; - } - hfsplus_asc2uni(sb, + int res = hfsplus_asc2uni(sb, (struct hfsplus_unistr *)&key->attr.key_name, - HFSPLUS_ATTR_MAX_STRLEN, name, len); + HFSPLUS_ATTR_MAX_STRLEN, name, strlen(name)); + if (res) + return res; len = be16_to_cpu(key->attr.key_name.length); } else { key->attr.key_name.length = 0; @@ -82,31 +79,6 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, return 0; } -void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, - u32 cnid, - struct hfsplus_attr_unistr *name) -{ - int ustrlen; - - memset(key, 0, sizeof(struct hfsplus_attr_key)); - ustrlen = be16_to_cpu(name->length); - key->attr.cnid = cpu_to_be32(cnid); - key->attr.key_name.length = cpu_to_be16(ustrlen); - ustrlen *= 2; - memcpy(key->attr.key_name.unicode, name->unicode, ustrlen); - - /* The length of the key, as stored in key_len field, does not include - * the size of the key_len field itself. - * So, offsetof(hfsplus_attr_key, key_name) is a trick because - * it takes into consideration key_len field (__be16) of - * hfsplus_attr_key structure instead of length field (__be16) of - * hfsplus_attr_unistr structure. - */ - key->key_len = - cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) + - ustrlen); -} - hfsplus_attr_entry *hfsplus_alloc_attr_entry(void) { return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 11c860204520..759708fd9331 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -27,13 +27,13 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) pagep = node->page + (off >> PAGE_CACHE_SHIFT); off &= ~PAGE_CACHE_MASK; - l = min(len, (int)PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_CACHE_SIZE - off); memcpy(buf, kmap(*pagep) + off, l); kunmap(*pagep); while ((len -= l) != 0) { buf += l; - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memcpy(buf, kmap(*++pagep), l); kunmap(*pagep); } @@ -80,14 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) pagep = node->page + (off >> PAGE_CACHE_SHIFT); off &= ~PAGE_CACHE_MASK; - l = min(len, (int)PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_CACHE_SIZE - off); memcpy(kmap(*pagep) + off, buf, l); set_page_dirty(*pagep); kunmap(*pagep); while ((len -= l) != 0) { buf += l; - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memcpy(kmap(*++pagep), buf, l); set_page_dirty(*pagep); kunmap(*pagep); @@ -110,13 +110,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) pagep = node->page + (off >> PAGE_CACHE_SHIFT); off &= ~PAGE_CACHE_MASK; - l = min(len, (int)PAGE_CACHE_SIZE - off); + l = min_t(int, len, PAGE_CACHE_SIZE - off); memset(kmap(*pagep) + off, 0, l); set_page_dirty(*pagep); kunmap(*pagep); while ((len -= l) != 0) { - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memset(kmap(*++pagep), 0, l); set_page_dirty(*pagep); kunmap(*pagep); @@ -142,14 +142,14 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, dst &= ~PAGE_CACHE_MASK; if (src == dst) { - l = min(len, (int)PAGE_CACHE_SIZE - src); + l = min_t(int, len, PAGE_CACHE_SIZE - src); memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l); kunmap(*src_page); set_page_dirty(*dst_page); kunmap(*dst_page); while ((len -= l) != 0) { - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memcpy(kmap(*++dst_page), kmap(*++src_page), l); kunmap(*src_page); set_page_dirty(*dst_page); @@ -251,7 +251,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) dst &= ~PAGE_CACHE_MASK; if (src == dst) { - l = min(len, (int)PAGE_CACHE_SIZE - src); + l = min_t(int, len, PAGE_CACHE_SIZE - src); memmove(kmap(*dst_page) + src, kmap(*src_page) + src, l); kunmap(*src_page); @@ -259,7 +259,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) kunmap(*dst_page); while ((len -= l) != 0) { - l = min(len, (int)PAGE_CACHE_SIZE); + l = min_t(int, len, PAGE_CACHE_SIZE); memmove(kmap(*++dst_page), kmap(*++src_page), l); kunmap(*src_page); @@ -386,9 +386,8 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) struct hfs_bnode *node; if (cnid >= tree->node_count) { - pr_err("request for non-existent node " - "%d in B*Tree\n", - cnid); + pr_err("request for non-existent node %d in B*Tree\n", + cnid); return NULL; } @@ -409,9 +408,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) loff_t off; if (cnid >= tree->node_count) { - pr_err("request for non-existent node " - "%d in B*Tree\n", - cnid); + pr_err("request for non-existent node %d in B*Tree\n", + cnid); return NULL; } @@ -602,7 +600,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) pagep = node->page; memset(kmap(*pagep) + node->page_offset, 0, - min((int)PAGE_CACHE_SIZE, (int)tree->node_size)); + min_t(int, PAGE_CACHE_SIZE, tree->node_size)); set_page_dirty(*pagep); kunmap(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { @@ -648,8 +646,8 @@ void hfs_bnode_put(struct hfs_bnode *node) if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); - hfs_bnode_clear(node, 0, - PAGE_CACHE_SIZE * tree->pages_per_bnode); + if (hfs_bnode_need_zeroout(tree)) + hfs_bnode_clear(node, 0, tree->node_size); hfs_bmap_free(node); hfs_bnode_free(node); return; @@ -658,3 +656,16 @@ void hfs_bnode_put(struct hfs_bnode *node) } } +/* + * Unused nodes have to be zeroed if this is the catalog tree and + * a corresponding flag in the volume header is set. + */ +bool hfs_bnode_need_zeroout(struct hfs_btree *tree) +{ + struct super_block *sb = tree->inode->i_sb; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes); + + return tree->cnid == HFSPLUS_CAT_CNID && + volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX; +} diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 0fcec8b2a90b..3345c7553edc 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -358,7 +358,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) u32 count; int res; - res = hfsplus_file_extend(inode); + res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree)); if (res) return ERR_PTR(res); hip->phys_size = inode->i_size = diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index bdec66522de3..610a3260bef1 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -12,6 +12,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/random.h> +#include <linux/nls.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" @@ -127,7 +128,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; - char strbuf[HFSPLUS_MAX_STRLEN + 1]; + char *strbuf; hfsplus_cat_entry entry; struct hfs_find_data fd; struct hfsplus_readdir_data *rd; @@ -139,6 +140,11 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; + strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN + 1, GFP_KERNEL); + if (!strbuf) { + err = -ENOMEM; + goto out; + } hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) @@ -193,7 +199,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = be16_to_cpu(entry.type); - len = HFSPLUS_MAX_STRLEN; + len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN; err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len); if (err) goto out; @@ -212,13 +218,31 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) be32_to_cpu(entry.folder.id), DT_DIR)) break; } else if (type == HFSPLUS_FILE) { + u16 mode; + unsigned type = DT_UNKNOWN; + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { pr_err("small file entry\n"); err = -EIO; goto out; } + + mode = be16_to_cpu(entry.file.permissions.mode); + if (S_ISREG(mode)) + type = DT_REG; + else if (S_ISLNK(mode)) + type = DT_LNK; + else if (S_ISFIFO(mode)) + type = DT_FIFO; + else if (S_ISCHR(mode)) + type = DT_CHR; + else if (S_ISBLK(mode)) + type = DT_BLK; + else if (S_ISSOCK(mode)) + type = DT_SOCK; + if (!dir_emit(ctx, strbuf, len, - be32_to_cpu(entry.file.id), DT_REG)) + be32_to_cpu(entry.file.id), type)) break; } else { pr_err("bad catalog entry type\n"); @@ -246,6 +270,7 @@ next: } memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); out: + kfree(strbuf); hfs_find_exit(&fd); return err; } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a7aafb35b624..feca524ce2a5 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -235,7 +235,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, if (iblock > hip->fs_blocks || !create) return -EIO; if (ablock >= hip->alloc_blocks) { - res = hfsplus_file_extend(inode); + res = hfsplus_file_extend(inode, false); if (res) return res; } @@ -425,7 +425,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, return res; } -int hfsplus_file_extend(struct inode *inode) +int hfsplus_file_extend(struct inode *inode, bool zeroout) { struct super_block *sb = inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); @@ -436,10 +436,9 @@ int hfsplus_file_extend(struct inode *inode) if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ - pr_err("extend alloc file! " - "(%llu,%u,%u)\n", - sbi->alloc_file->i_size * 8, - sbi->total_blocks, sbi->free_blocks); + pr_err("extend alloc file! (%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } @@ -463,6 +462,12 @@ int hfsplus_file_extend(struct inode *inode) } } + if (zeroout) { + res = sb_issue_zeroout(sb, start, len, GFP_NOFS); + if (res) + goto out; + } + hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 83dc29286b10..eb5e059f481a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -369,114 +369,119 @@ typedef int (*search_strategy_t)(struct hfs_bnode *, /* attributes.c */ int __init hfsplus_create_attr_tree_cache(void); void hfsplus_destroy_attr_tree_cache(void); +int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 cnid, const char *name); hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); -void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); -int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *, - const hfsplus_btree_key *); -int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *, - u32, const char *); -void hfsplus_attr_build_key_uni(hfsplus_btree_key *key, - u32 cnid, - struct hfsplus_attr_unistr *name); -int hfsplus_find_attr(struct super_block *, u32, - const char *, struct hfs_find_data *); +void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry); +int hfsplus_find_attr(struct super_block *sb, u32 cnid, const char *name, + struct hfs_find_data *fd); int hfsplus_attr_exists(struct inode *inode, const char *name); -int hfsplus_create_attr(struct inode *, const char *, const void *, size_t); -int hfsplus_delete_attr(struct inode *, const char *); +int hfsplus_create_attr(struct inode *inode, const char *name, + const void *value, size_t size); +int hfsplus_delete_attr(struct inode *inode, const char *name); int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid); /* bitmap.c */ -int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *); -int hfsplus_block_free(struct super_block *, u32, u32); +int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, + u32 *max); +int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count); /* btree.c */ -u32 hfsplus_calc_btree_clump_size(u32, u32, u64, int); -struct hfs_btree *hfs_btree_open(struct super_block *, u32); -void hfs_btree_close(struct hfs_btree *); -int hfs_btree_write(struct hfs_btree *); -struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *); -void hfs_bmap_free(struct hfs_bnode *); +u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, + int file_id); +struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id); +void hfs_btree_close(struct hfs_btree *tree); +int hfs_btree_write(struct hfs_btree *tree); +struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree); +void hfs_bmap_free(struct hfs_bnode *node); /* bnode.c */ -void hfs_bnode_read(struct hfs_bnode *, void *, int, int); -u16 hfs_bnode_read_u16(struct hfs_bnode *, int); -u8 hfs_bnode_read_u8(struct hfs_bnode *, int); -void hfs_bnode_read_key(struct hfs_bnode *, void *, int); -void hfs_bnode_write(struct hfs_bnode *, void *, int, int); -void hfs_bnode_write_u16(struct hfs_bnode *, int, u16); -void hfs_bnode_clear(struct hfs_bnode *, int, int); -void hfs_bnode_copy(struct hfs_bnode *, int, - struct hfs_bnode *, int, int); -void hfs_bnode_move(struct hfs_bnode *, int, int, int); -void hfs_bnode_dump(struct hfs_bnode *); -void hfs_bnode_unlink(struct hfs_bnode *); -struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32); -struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32); -void hfs_bnode_unhash(struct hfs_bnode *); -void hfs_bnode_free(struct hfs_bnode *); -struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32); -void hfs_bnode_get(struct hfs_bnode *); -void hfs_bnode_put(struct hfs_bnode *); +void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len); +u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off); +u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off); +void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off); +void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len); +void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data); +void hfs_bnode_clear(struct hfs_bnode *node, int off, int len); +void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, + struct hfs_bnode *src_node, int src, int len); +void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len); +void hfs_bnode_dump(struct hfs_bnode *node); +void hfs_bnode_unlink(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid); +void hfs_bnode_unhash(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num); +void hfs_bnode_free(struct hfs_bnode *node); +struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num); +void hfs_bnode_get(struct hfs_bnode *node); +void hfs_bnode_put(struct hfs_bnode *node); +bool hfs_bnode_need_zeroout(struct hfs_btree *tree); /* brec.c */ -u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *); -u16 hfs_brec_keylen(struct hfs_bnode *, u16); -int hfs_brec_insert(struct hfs_find_data *, void *, int); -int hfs_brec_remove(struct hfs_find_data *); +u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off); +u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec); +int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len); +int hfs_brec_remove(struct hfs_find_data *fd); /* bfind.c */ -int hfs_find_init(struct hfs_btree *, struct hfs_find_data *); -void hfs_find_exit(struct hfs_find_data *); -int hfs_find_1st_rec_by_cnid(struct hfs_bnode *, - struct hfs_find_data *, - int *, int *, int *); -int hfs_find_rec_by_key(struct hfs_bnode *, - struct hfs_find_data *, - int *, int *, int *); -int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *, - search_strategy_t); -int hfs_brec_find(struct hfs_find_data *, search_strategy_t); -int hfs_brec_read(struct hfs_find_data *, void *, int); -int hfs_brec_goto(struct hfs_find_data *, int); +int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd); +void hfs_find_exit(struct hfs_find_data *fd); +int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, + int *begin, int *end, int *cur_rec); +int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, + int *begin, int *end, int *cur_rec); +int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, + search_strategy_t rec_found); +int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare); +int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len); +int hfs_brec_goto(struct hfs_find_data *fd, int cnt); /* catalog.c */ -int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *, - const hfsplus_btree_key *); -int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *, - const hfsplus_btree_key *); -void hfsplus_cat_build_key(struct super_block *sb, - hfsplus_btree_key *, u32, struct qstr *); -int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *); -int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *); -int hfsplus_delete_cat(u32, struct inode *, struct qstr *); -int hfsplus_rename_cat(u32, struct inode *, struct qstr *, - struct inode *, struct qstr *); +int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, + u32 parent, struct qstr *str); void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); +int hfsplus_find_cat(struct super_block *sb, u32 cnid, + struct hfs_find_data *fd); +int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, + struct inode *inode); +int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str); +int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name, + struct inode *dst_dir, struct qstr *dst_name); /* dir.c */ extern const struct inode_operations hfsplus_dir_inode_operations; extern const struct file_operations hfsplus_dir_operations; /* extents.c */ -int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); -int hfsplus_ext_write_extent(struct inode *); -int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int); -int hfsplus_free_fork(struct super_block *, u32, - struct hfsplus_fork_raw *, int); -int hfsplus_file_extend(struct inode *); -void hfsplus_file_truncate(struct inode *); +int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1, + const hfsplus_btree_key *k2); +int hfsplus_ext_write_extent(struct inode *inode); +int hfsplus_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int hfsplus_free_fork(struct super_block *sb, u32 cnid, + struct hfsplus_fork_raw *fork, int type); +int hfsplus_file_extend(struct inode *inode, bool zeroout); +void hfsplus_file_truncate(struct inode *inode); /* inode.c */ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *); -void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *); -int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *); -int hfsplus_cat_write_inode(struct inode *); -struct inode *hfsplus_new_inode(struct super_block *, umode_t); -void hfsplus_delete_inode(struct inode *); +struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode); +void hfsplus_delete_inode(struct inode *inode); +void hfsplus_inode_read_fork(struct inode *inode, + struct hfsplus_fork_raw *fork); +void hfsplus_inode_write_fork(struct inode *inode, + struct hfsplus_fork_raw *fork); +int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); +int hfsplus_cat_write_inode(struct inode *inode); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); @@ -484,13 +489,17 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* options.c */ -int hfsplus_parse_options(char *, struct hfsplus_sb_info *); +void hfsplus_fill_defaults(struct hfsplus_sb_info *opts); int hfsplus_parse_options_remount(char *input, int *force); -void hfsplus_fill_defaults(struct hfsplus_sb_info *); -int hfsplus_show_options(struct seq_file *, struct dentry *); +int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi); +int hfsplus_show_options(struct seq_file *seq, struct dentry *root); + +/* part_tbl.c */ +int hfs_part_find(struct super_block *sb, sector_t *part_start, + sector_t *part_size); /* super.c */ -struct inode *hfsplus_iget(struct super_block *, unsigned long); +struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino); void hfsplus_mark_mdb_dirty(struct super_block *sb); /* tables.c */ @@ -499,23 +508,23 @@ extern u16 hfsplus_decompose_table[]; extern u16 hfsplus_compose_table[]; /* unicode.c */ -int hfsplus_strcasecmp(const struct hfsplus_unistr *, - const struct hfsplus_unistr *); -int hfsplus_strcmp(const struct hfsplus_unistr *, - const struct hfsplus_unistr *); -int hfsplus_uni2asc(struct super_block *, - const struct hfsplus_unistr *, char *, int *); -int hfsplus_asc2uni(struct super_block *, - struct hfsplus_unistr *, int, const char *, int); +int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2); +int hfsplus_strcmp(const struct hfsplus_unistr *s1, + const struct hfsplus_unistr *s2); +int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, + char *astr, int *len_p); +int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, + int max_unistr_len, const char *astr, int len); int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str); -int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry, - unsigned int len, const char *str, const struct qstr *name); +int hfsplus_compare_dentry(const struct dentry *parent, + const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name); /* wrapper.c */ -int hfsplus_read_wrapper(struct super_block *); -int hfs_part_find(struct super_block *, sector_t *, sector_t *); -int hfsplus_submit_bio(struct super_block *sb, sector_t sector, - void *buf, void **data, int rw); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, + void **data, int rw); +int hfsplus_read_wrapper(struct super_block *sb); /* time macros */ #define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 5a126828d85e..8298d0985f81 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -144,6 +144,7 @@ struct hfsplus_vh { #define HFSPLUS_VOL_NODEID_REUSED (1 << 12) #define HFSPLUS_VOL_JOURNALED (1 << 13) #define HFSPLUS_VOL_SOFTLOCK (1 << 15) +#define HFSPLUS_VOL_UNUSED_NODE_FIX (1 << 31) /* HFS+ BTree node descriptor */ struct hfs_bnode_desc { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index a4f45bd88a63..0cf786f2d046 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -123,14 +123,15 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) } static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file)->i_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfsplus_get_block); /* @@ -139,7 +140,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) hfsplus_write_failed(mapping, end); @@ -340,10 +341,10 @@ static const struct inode_operations hfsplus_file_inode_operations = { static const struct file_operations hfsplus_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, .fsync = hfsplus_file_fsync, diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 68537e8b7a09..c90b72ee676d 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -173,9 +173,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (p) sbi->nls = load_nls(p); if (!sbi->nls) { - pr_err("unable to load " - "nls mapping \"%s\"\n", - p); + pr_err("unable to load nls mapping \"%s\"\n", + p); kfree(p); return 0; } @@ -232,8 +231,8 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) if (sbi->nls) seq_printf(seq, ",nls=%s", sbi->nls->charset); if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) - seq_printf(seq, ",nodecompose"); + seq_puts(seq, ",nodecompose"); if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - seq_printf(seq, ",nobarrier"); + seq_puts(seq, ",nobarrier"); return 0; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index a513d2d36be9..4cf2024b87da 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -131,9 +131,10 @@ static int hfsplus_system_write_inode(struct inode *inode) hfsplus_inode_write_fork(inode, fork); if (tree) { int err = hfs_btree_write(tree); + if (err) { pr_err("b-tree write err: %d, ino %lu\n", - err, inode->i_ino); + err, inode->i_ino); return err; } } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 3f999649587f..cc6235671437 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -24,8 +24,8 @@ struct hfsplus_wd { u16 embed_count; }; -/* - * hfsplus_submit_bio - Perfrom block I/O +/** + * hfsplus_submit_bio - Perform block I/O * @sb: super block of volume for I/O * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O @@ -231,10 +231,8 @@ reread: if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize)) goto out_free_backup_vhdr; sbi->alloc_blksz = blocksize; - sbi->alloc_blksz_shift = 0; - while ((blocksize >>= 1) != 0) - sbi->alloc_blksz_shift++; - blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE); + sbi->alloc_blksz_shift = ilog2(blocksize); + blocksize = min_t(u32, sbi->alloc_blksz, PAGE_SIZE); /* * Align block size to block offset. diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 4e27edc082a4..d98094a9f476 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -8,6 +8,7 @@ #include "hfsplus_fs.h" #include <linux/posix_acl_xattr.h> +#include <linux/nls.h> #include "xattr.h" #include "acl.h" @@ -66,10 +67,10 @@ static void hfsplus_init_header_node(struct inode *attr_file, char *bmp; u32 used_nodes; u32 used_bmp_bytes; - loff_t tmp; + u64 tmp; hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", - clump_size, node_size); + clump_size, node_size); /* The end of the node contains list of record offsets */ rec_offsets = (__be16 *)(buf + node_size); @@ -195,7 +196,7 @@ check_attr_tree_state_again: } while (hip->alloc_blocks < hip->clump_blocks) { - err = hfsplus_file_extend(attr_file); + err = hfsplus_file_extend(attr_file, false); if (unlikely(err)) { pr_err("failed to extend attributes file\n"); goto end_attr_file_creation; @@ -645,8 +646,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) struct hfs_find_data fd; u16 key_len = 0; struct hfsplus_attr_key attr_key; - char strbuf[HFSPLUS_ATTR_MAX_STRLEN + - XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; + char *strbuf; int xattr_name_len; if ((!S_ISREG(inode->i_mode) && @@ -666,6 +666,13 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) return err; } + strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); + if (!strbuf) { + res = -ENOMEM; + goto out; + } + err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd); if (err) { if (err == -ENOENT) { @@ -692,7 +699,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) if (be32_to_cpu(attr_key.cnid) != inode->i_ino) goto end_listxattr; - xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN; + xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; if (hfsplus_uni2asc(inode->i_sb, (const struct hfsplus_unistr *)&fd.key->attr.key_name, strbuf, &xattr_name_len)) { @@ -718,6 +725,8 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) } end_listxattr: + kfree(strbuf); +out: hfs_find_exit(&fd); return res; } @@ -797,47 +806,55 @@ end_removexattr: static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + - XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - /* * Don't allow retrieving properly prefixed attributes * by prepending them with "osx." */ if (is_known_namespace(name)) return -EOPNOTSUPP; + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); + strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + - XATTR_MAC_OSX_PREFIX_LEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - /* * Don't allow setting properly prefixed attributes * by prepending them with "osx." */ if (is_known_namespace(name)) return -EOPNOTSUPP; + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); + strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 00722765ea79..6ec5e107691f 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -7,6 +7,8 @@ */ #include <linux/security.h> +#include <linux/nls.h> + #include "hfsplus_fs.h" #include "xattr.h" #include "acl.h" @@ -14,37 +16,43 @@ static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_SECURITY_PREFIX); strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_SECURITY_PREFIX); strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, @@ -62,31 +70,30 @@ static int hfsplus_initxattrs(struct inode *inode, void *fs_info) { const struct xattr *xattr; - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t xattr_name_len; + char *xattr_name; int err = 0; + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - xattr_name_len = strlen(xattr->name); - if (xattr_name_len == 0) + if (!strcmp(xattr->name, "")) continue; - if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN > - HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - strcpy(xattr_name, XATTR_SECURITY_PREFIX); strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, xattr->name); memset(xattr_name + - XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1); + XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name), 0, 1); err = __hfsplus_setxattr(inode, xattr_name, xattr->value, xattr->value_len, 0); if (err) break; } + kfree(xattr_name); return err; } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index 426cee277542..3c5f27e4746a 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -6,43 +6,51 @@ * Handler for trusted extended attributes. */ +#include <linux/nls.h> + #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_TRUSTED_PREFIX); strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_TRUSTED_PREFIX); strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index e34016561ae0..2b625a538b64 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -6,43 +6,51 @@ * Handler for user extended attributes. */ +#include <linux/nls.h> + #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_USER_PREFIX); strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); - return hfsplus_getxattr(dentry, xattr_name, buffer, size); + res = hfsplus_getxattr(dentry, xattr_name, buffer, size); + kfree(xattr_name); + return res; } static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0}; - size_t len = strlen(name); + char *xattr_name; + int res; if (!strcmp(name, "")) return -EINVAL; - if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN) - return -EOPNOTSUPP; - + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; strcpy(xattr_name, XATTR_USER_PREFIX); strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); - return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); + kfree(xattr_name); + return res; } static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 9c470fde9878..bb529f3b7f2b 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -378,11 +378,11 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, static const struct file_operations hostfs_file_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, + .read = new_sync_read, .splice_read = generic_file_splice_read, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, - .write = do_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .write = new_sync_write, .mmap = generic_file_mmap, .open = hostfs_file_open, .release = hostfs_file_release, diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index 58b5106186d0..f005046e1591 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -316,7 +316,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) struct quad_buffer_head qbh; __le32 *bmp; struct hpfs_sb_info *sbi = hpfs_sb(s); - /*printk("2 - ");*/ + /*pr_info("2 - ");*/ if (!n) return; if (sec < 0x12) { hpfs_error(s, "Trying to free reserved sector %08x", sec); diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 139ef1684d07..8057fe4e6574 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -55,7 +55,7 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head if (bh != NULL) return bh->b_data; else { - printk("HPFS: hpfs_map_sector: read error\n"); + pr_err("%s(): read error\n", __func__); return NULL; } } @@ -76,7 +76,7 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head set_buffer_uptodate(bh); return bh->b_data; } else { - printk("HPFS: hpfs_get_sector: getblk failed\n"); + pr_err("%s(): getblk failed\n", __func__); return NULL; } } @@ -93,7 +93,7 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe cond_resched(); if (secno & 3) { - printk("HPFS: hpfs_map_4sectors: unaligned read\n"); + pr_err("%s(): unaligned read\n", __func__); return NULL; } @@ -112,7 +112,7 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { - printk("HPFS: hpfs_map_4sectors: out of memory\n"); + pr_err("%s(): out of memory\n", __func__); goto bail4; } @@ -145,7 +145,7 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno, hpfs_lock_assert(s); if (secno & 3) { - printk("HPFS: hpfs_get_4sectors: unaligned read\n"); + pr_err("%s(): unaligned read\n", __func__); return NULL; } @@ -161,7 +161,7 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno, } if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { - printk("HPFS: hpfs_get_4sectors: out of memory\n"); + pr_err("%s(): out of memory\n", __func__); goto bail4; } return qbh->data; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 292b1acb9b81..2a8e07425de0 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -36,7 +36,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) mutex_lock(&i->i_mutex); hpfs_lock(s); - /*printk("dir lseek\n");*/ + /*pr_info("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { @@ -51,7 +51,7 @@ ok: mutex_unlock(&i->i_mutex); return new_off; fail: - /*printk("illegal lseek: %016llx\n", new_off);*/ + /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); mutex_unlock(&i->i_mutex); return -ESPIPE; @@ -127,7 +127,7 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) if (ctx->pos == 12) goto out; if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) { - printk("HPFS: warning: pos==%d\n",(int)ctx->pos); + pr_err("pos==%d\n", (int)ctx->pos); goto out; } if (ctx->pos == 0) { diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 4364b2a02c5d..f36fc010fccb 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -17,7 +17,7 @@ static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde) if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i; i++; } - printk("HPFS: get_pos: not_found\n"); + pr_info("%s(): not_found\n", __func__); return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1; } @@ -32,7 +32,7 @@ void hpfs_add_pos(struct inode *inode, loff_t *pos) if (hpfs_inode->i_rddir_off[i] == pos) return; if (!(i&0x0f)) { if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) { - printk("HPFS: out of memory for position list\n"); + pr_err("out of memory for position list\n"); return; } if (hpfs_inode->i_rddir_off) { @@ -63,7 +63,8 @@ void hpfs_del_pos(struct inode *inode, loff_t *pos) } return; not_f: - /*printk("HPFS: warning: position pointer %p->%08x not found\n", pos, (int)*pos);*/ + /*pr_warn("position pointer %p->%08x not found\n", + pos, (int)*pos);*/ return; } @@ -92,8 +93,11 @@ static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c) { if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { int n = (*p & 0x3f) + c; - if (n > 0x3f) printk("HPFS: hpfs_pos_ins: %08x + %d\n", (int)*p, (int)c >> 8); - else *p = (*p & ~0x3f) | n; + if (n > 0x3f) + pr_err("%s(): %08x + %d\n", + __func__, (int)*p, (int)c >> 8); + else + *p = (*p & ~0x3f) | n; } } @@ -101,8 +105,11 @@ static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c) { if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) { int n = (*p & 0x3f) - c; - if (n < 1) printk("HPFS: hpfs_pos_ins: %08x - %d\n", (int)*p, (int)c >> 8); - else *p = (*p & ~0x3f) | n; + if (n < 1) + pr_err("%s(): %08x - %d\n", + __func__, (int)*p, (int)c >> 8); + else + *p = (*p & ~0x3f) | n; } } @@ -239,12 +246,12 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, struct fnode *fnode; int c1, c2 = 0; if (!(nname = kmalloc(256, GFP_NOFS))) { - printk("HPFS: out of memory, can't add to dnode\n"); + pr_err("out of memory, can't add to dnode\n"); return 1; } go_up: if (namelen >= 256) { - hpfs_error(i->i_sb, "hpfs_add_to_dnode: namelen == %d", namelen); + hpfs_error(i->i_sb, "%s(): namelen == %d", __func__, namelen); kfree(nd); kfree(nname); return 1; @@ -281,7 +288,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, not be any error while splitting dnodes, otherwise the whole directory, not only file we're adding, would be lost. */ - printk("HPFS: out of memory for dnode splitting\n"); + pr_err("out of memory for dnode splitting\n"); hpfs_brelse4(&qbh); kfree(nname); return 1; @@ -597,7 +604,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (!de_next->down) goto endm; ndown = de_down_pointer(de_next); if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) { - printk("HPFS: out of memory for dtree balancing\n"); + pr_err("out of memory for dtree balancing\n"); goto endm; } memcpy(de_cp, de, le16_to_cpu(de->length)); @@ -612,7 +619,8 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) hpfs_brelse4(&qbh1); } hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? de_down_pointer(de_cp) : 0); - /*printk("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", up, ndown, down, dno);*/ + /*pr_info("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", + up, ndown, down, dno);*/ dno = up; kfree(de_cp); goto try_it_again; @@ -637,15 +645,15 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (!dlp && down) { if (le32_to_cpu(d1->first_free) > 2044) { if (hpfs_sb(i->i_sb)->sb_chk >= 2) { - printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); - printk("HPFS: warning: terminating balancing operation\n"); + pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); + pr_err("terminating balancing operation\n"); } hpfs_brelse4(&qbh1); goto endm; } if (hpfs_sb(i->i_sb)->sb_chk >= 2) { - printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); - printk("HPFS: warning: goin'on\n"); + pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n"); + pr_err("goin'on\n"); } le16_add_cpu(&del->length, 4); del->down = 1; @@ -659,7 +667,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down); } else goto endm; if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) { - printk("HPFS: out of memory for dtree balancing\n"); + pr_err("out of memory for dtree balancing\n"); hpfs_brelse4(&qbh1); goto endm; } @@ -1000,7 +1008,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, int d1, d2 = 0; name1 = f->name; if (!(name2 = kmalloc(256, GFP_NOFS))) { - printk("HPFS: out of memory, can't map dirent\n"); + pr_err("out of memory, can't map dirent\n"); return NULL; } if (f->len <= 15) diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index bcaafcd2666a..ce3f98ba993a 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -51,7 +51,7 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size) { char *ret; if (!(ret = kmalloc(size + 1, GFP_NOFS))) { - printk("HPFS: out of memory for EA\n"); + pr_err("out of memory for EA\n"); return NULL; } if (hpfs_ea_read(s, a, ano, 0, size, ret)) { @@ -139,7 +139,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (ea_indirect(ea)) return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { - printk("HPFS: out of memory for EA\n"); + pr_err("out of memory for EA\n"); return NULL; } memcpy(ret, ea_data(ea), ea_valuelen(ea)); @@ -165,7 +165,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si if (ea_indirect(ea)) return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea)); if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) { - printk("HPFS: out of memory for EA\n"); + pr_err("out of memory for EA\n"); return NULL; } if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) { diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 67c1a61e0955..7f54e5f76cec 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -197,10 +197,10 @@ const struct address_space_operations hpfs_aops = { const struct file_operations hpfs_file_ops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .release = hpfs_file_release, .fsync = hpfs_file_fsync, diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 3ba49c080e42..b63b75fa00e7 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -8,6 +8,11 @@ //#define DBG //#define DEBUG_LOCKS +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mutex.h> #include <linux/pagemap.h> diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 50a427313835..7ce4b74234a1 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -183,7 +183,8 @@ void hpfs_write_inode(struct inode *i) struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { - if (*hpfs_inode->i_rddir_off) printk("HPFS: write_inode: some position still there\n"); + if (*hpfs_inode->i_rddir_off) + pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); hpfs_inode->i_rddir_off = NULL; } diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 3aa66ae1031e..442770edcdc7 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -65,12 +65,13 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0); if (!cp) return NULL; if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) { - printk("HPFS: Code page directory magic doesn't match (magic = %08x)\n", le32_to_cpu(cp->magic)); + pr_err("Code page directory magic doesn't match (magic = %08x)\n", + le32_to_cpu(cp->magic)); brelse(bh); return NULL; } if (!le32_to_cpu(cp->n_code_pages)) { - printk("HPFS: n_code_pages == 0\n"); + pr_err("n_code_pages == 0\n"); brelse(bh); return NULL; } @@ -79,19 +80,19 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) brelse(bh); if (cpi >= 3) { - printk("HPFS: Code page index out of array\n"); + pr_err("Code page index out of array\n"); return NULL; } if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL; if (le16_to_cpu(cpd->offs[cpi]) > 0x178) { - printk("HPFS: Code page index out of sector\n"); + pr_err("Code page index out of sector\n"); brelse(bh); return NULL; } ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6; if (!(cp_table = kmalloc(256, GFP_KERNEL))) { - printk("HPFS: out of memory for code page table\n"); + pr_err("out of memory for code page table\n"); brelse(bh); return NULL; } @@ -114,7 +115,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) int i; __le32 *b; if (!(b = kmalloc(n * 512, GFP_KERNEL))) { - printk("HPFS: can't allocate memory for bitmap directory\n"); + pr_err("can't allocate memory for bitmap directory\n"); return NULL; } for (i=0;i<n;i++) { @@ -281,7 +282,9 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, hpfs_error(s, "dnode %08x does not end with \\377 entry", secno); goto bail; } - if (b == 3) printk("HPFS: warning: unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", secno); + if (b == 3) + pr_err("unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", + secno); } return dnode; bail: diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c index 9acdf338def0..b00d396d22c6 100644 --- a/fs/hpfs/name.c +++ b/fs/hpfs/name.c @@ -56,14 +56,15 @@ unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from, unsigned char *to; int i; if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { - printk("HPFS: Long name flag mismatch - name "); - for (i=0; i<len; i++) printk("%c", from[i]); - printk(" misidentified as %s.\n", lng ? "short" : "long"); - printk("HPFS: It's nothing serious. It could happen because of bug in OS/2.\nHPFS: Set checks=normal to disable this message.\n"); + pr_err("Long name flag mismatch - name "); + for (i = 0; i < len; i++) + pr_cont("%c", from[i]); + pr_cont(" misidentified as %s.\n", lng ? "short" : "long"); + pr_err("It's nothing serious. It could happen because of bug in OS/2.\nSet checks=normal to disable this message.\n"); } if (!lc) return from; if (!(to = kmalloc(len, GFP_KERNEL))) { - printk("HPFS: can't allocate memory for name conversion buffer\n"); + pr_err("can't allocate memory for name conversion buffer\n"); return from; } for (i = 0; i < len; i++) to[i] = locase(hpfs_sb(s)->sb_cp_table,from[i]); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 1b39afdd86fd..bdbc2c3080a4 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -404,7 +404,7 @@ again: d_rehash(dentry); } else { struct iattr newattrs; - /*printk("HPFS: truncating file before delete.\n");*/ + /*pr_info("truncating file before delete.\n");*/ newattrs.ia_size = 0; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; err = notify_change(dentry, &newattrs, NULL); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index fe3463a43236..7cd00d3a7c9b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -62,22 +62,26 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) vsnprintf(err_buf, sizeof(err_buf), fmt, args); va_end(args); - printk("HPFS: filesystem error: %s", err_buf); + pr_err("filesystem error: %s", err_buf); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { - printk("; crashing the system because you wanted it\n"); + pr_cont("; crashing the system because you wanted it\n"); mark_dirty(s, 0); panic("HPFS panic"); } else if (hpfs_sb(s)->sb_err == 1) { - if (s->s_flags & MS_RDONLY) printk("; already mounted read-only\n"); + if (s->s_flags & MS_RDONLY) + pr_cont("; already mounted read-only\n"); else { - printk("; remounting read-only\n"); + pr_cont("; remounting read-only\n"); mark_dirty(s, 0); s->s_flags |= MS_RDONLY; } - } else if (s->s_flags & MS_RDONLY) printk("; going on - but anything won't be destroyed because it's read-only\n"); - else printk("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); - } else printk("\n"); + } else if (s->s_flags & MS_RDONLY) + pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); + else + pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); + } else + pr_cont("\n"); hpfs_sb(s)->sb_was_error = 1; } @@ -292,7 +296,7 @@ static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, if (!opts) return 1; - /*printk("Parsing opts: '%s'\n",opts);*/ + /*pr_info("Parsing opts: '%s'\n",opts);*/ while ((p = strsep(&opts, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -387,7 +391,7 @@ static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, static inline void hpfs_help(void) { - printk("\n\ + pr_info("\n\ HPFS filesystem options:\n\ help do not mount and display this text\n\ uid=xxx set uid of files that don't have uid specified in eas\n\ @@ -434,7 +438,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, &eas, &chk, &errs, &chkdsk, ×hift))) { - printk("HPFS: bad mount options.\n"); + pr_err("bad mount options.\n"); goto out_err; } if (o == 2) { @@ -442,7 +446,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) goto out_err; } if (timeshift != sbi->sb_timeshift) { - printk("HPFS: timeshift can't be changed using remount.\n"); + pr_err("timeshift can't be changed using remount.\n"); goto out_err; } @@ -523,7 +527,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, &eas, &chk, &errs, &chkdsk, ×hift))) { - printk("HPFS: bad mount options.\n"); + pr_err("bad mount options.\n"); goto bail0; } if (o==2) { @@ -542,16 +546,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC ||*/ le32_to_cpu(superblock->magic) != SB_MAGIC || le32_to_cpu(spareblock->magic) != SP_MAGIC) { - if (!silent) printk("HPFS: Bad magic ... probably not HPFS\n"); + if (!silent) + pr_err("Bad magic ... probably not HPFS\n"); goto bail4; } /* Check version */ if (!(s->s_flags & MS_RDONLY) && superblock->funcversion != 2 && superblock->funcversion != 3) { - printk("HPFS: Bad version %d,%d. Mount readonly to go around\n", + pr_err("Bad version %d,%d. Mount readonly to go around\n", (int)superblock->version, (int)superblock->funcversion); - printk("HPFS: please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); + pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n"); goto bail4; } @@ -597,7 +602,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { if (errs == 2) { - printk("HPFS: Improperly stopped, not mounted\n"); + pr_err("Improperly stopped, not mounted\n"); goto bail4; } hpfs_error(s, "improperly stopped"); @@ -611,22 +616,25 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (spareblock->hotfixes_used || spareblock->n_spares_used) { if (errs >= 2) { - printk("HPFS: Hotfixes not supported here, try chkdsk\n"); + pr_err("Hotfixes not supported here, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "hotfixes not supported here, try chkdsk"); - if (errs == 0) printk("HPFS: Proceeding, but your filesystem will be probably corrupted by this driver...\n"); - else printk("HPFS: This driver may read bad files or crash when operating on disk with hotfixes.\n"); + if (errs == 0) + pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n"); + else + pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n"); } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { if (errs >= 2) { - printk("HPFS: Spare dnodes used, try chkdsk\n"); + pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); - if (errs == 0) printk("HPFS: Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); + if (errs == 0) + pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } if (chk) { unsigned a; @@ -645,12 +653,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } sbi->sb_dirband_size = a; - } else printk("HPFS: You really don't want any checks? You are crazy...\n"); + } else + pr_err("You really don't want any checks? You are crazy...\n"); /* Load code page table */ if (le32_to_cpu(spareblock->n_code_pages)) if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir)))) - printk("HPFS: Warning: code page support is disabled\n"); + pr_err("code page support is disabled\n"); brelse(bh2); brelse(bh1); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 204027520937..1e2872b25343 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -6,6 +6,8 @@ * Copyright (C) 2002 Linus Torvalds. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> @@ -475,7 +477,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, * annotation because huge_pmd_share() does an allocation under * i_mmap_mutex. */ -struct lock_class_key hugetlbfs_i_mmap_mutex_key; +static struct lock_class_key hugetlbfs_i_mmap_mutex_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, @@ -823,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) ps = memparse(args[0].from, &rest); pconfig->hstate = size_to_hstate(ps); if (!pconfig->hstate) { - printk(KERN_ERR - "hugetlbfs: Unsupported page size %lu MB\n", + pr_err("Unsupported page size %lu MB\n", ps >> 20); return -EINVAL; } @@ -832,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) } default: - printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", - p); + pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; break; } @@ -853,8 +853,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) return 0; bad_val: - printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", - args[0].from, p); + pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); return -EINVAL; } @@ -902,8 +901,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) goto out_free; return 0; out_free: - if (sbinfo->spool) - kfree(sbinfo->spool); + kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } @@ -939,7 +937,7 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static struct dentry_operations anon_ops = { +static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; @@ -970,8 +968,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, *user = current_user(); if (user_shm_lock(size, *user)) { task_lock(current); - printk_once(KERN_WARNING - "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", + pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { @@ -1030,6 +1027,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; @@ -1055,7 +1057,7 @@ static int __init init_hugetlbfs_fs(void) buf); if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("hugetlb: Cannot mount internal hugetlbfs for " + pr_err("Cannot mount internal hugetlbfs for " "page size %uK", ps_kb); error = PTR_ERR(hugetlbfs_vfsmount[i]); hugetlbfs_vfsmount[i] = NULL; diff --git a/fs/inode.c b/fs/inode.c index f96d2a6f88cc..6eecb7ff0b9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -105,7 +105,7 @@ long get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(ctl_table *table, int write, +int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); @@ -1839,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner); * inode_owner_or_capable - check current task permissions to inode * @inode: inode being checked * - * Return true if current either has CAP_FOWNER to the inode, or - * owns the file. + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. */ bool inode_owner_or_capable(const struct inode *inode) { + struct user_namespace *ns; + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (inode_capable(inode, CAP_FOWNER)) + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) return true; return false; } diff --git a/fs/ioprio.c b/fs/ioprio.c deleted file mode 100644 index e50170ca7c33..000000000000 --- a/fs/ioprio.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * fs/ioprio.c - * - * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> - * - * Helper functions for setting/querying io priorities of processes. The - * system calls closely mimmick getpriority/setpriority, see the man page for - * those. The prio argument is a composite of prio class and prio data, where - * the data argument has meaning within that class. The standard scheduling - * classes have 8 distinct prio levels, with 0 being the highest prio and 7 - * being the lowest. - * - * IOW, setting BE scheduling class with prio 2 is done ala: - * - * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; - * - * ioprio_set(PRIO_PROCESS, pid, prio); - * - * See also Documentation/block/ioprio.txt - * - */ -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/ioprio.h> -#include <linux/blkdev.h> -#include <linux/capability.h> -#include <linux/syscalls.h> -#include <linux/security.h> -#include <linux/pid_namespace.h> - -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - -SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) -{ - int class = IOPRIO_PRIO_CLASS(ioprio); - int data = IOPRIO_PRIO_DATA(ioprio); - struct task_struct *p, *g; - struct user_struct *user; - struct pid *pgrp; - kuid_t uid; - int ret; - - switch (class) { - case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* fall through, rt has prio field too */ - case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) - return -EINVAL; - - break; - case IOPRIO_CLASS_IDLE: - break; - case IOPRIO_CLASS_NONE: - if (data) - return -EINVAL; - break; - default: - return -EINVAL; - } - - ret = -ESRCH; - rcu_read_lock(); - switch (which) { - case IOPRIO_WHO_PROCESS: - if (!who) - p = current; - else - p = find_task_by_vpid(who); - if (p) - ret = set_task_ioprio(p, ioprio); - break; - case IOPRIO_WHO_PGRP: - if (!who) - pgrp = task_pgrp(current); - else - pgrp = find_vpid(who); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - ret = set_task_ioprio(p, ioprio); - if (ret) - break; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - uid = make_kuid(current_user_ns(), who); - if (!uid_valid(uid)) - break; - if (!who) - user = current_user(); - else - user = find_user(uid); - - if (!user) - break; - - do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) - continue; - ret = set_task_ioprio(p, ioprio); - if (ret) - goto free_uid; - } while_each_thread(g, p); -free_uid: - if (who) - free_uid(user); - break; - default: - ret = -EINVAL; - } - - rcu_read_unlock(); - return ret; -} - -static int get_task_ioprio(struct task_struct *p) -{ - int ret; - - ret = security_task_getioprio(p); - if (ret) - goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); - if (p->io_context) - ret = p->io_context->ioprio; -out: - return ret; -} - -int ioprio_best(unsigned short aprio, unsigned short bprio) -{ - unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); - unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); - - if (aclass == IOPRIO_CLASS_NONE) - aclass = IOPRIO_CLASS_BE; - if (bclass == IOPRIO_CLASS_NONE) - bclass = IOPRIO_CLASS_BE; - - if (aclass == bclass) - return min(aprio, bprio); - if (aclass > bclass) - return bprio; - else - return aprio; -} - -SYSCALL_DEFINE2(ioprio_get, int, which, int, who) -{ - struct task_struct *g, *p; - struct user_struct *user; - struct pid *pgrp; - kuid_t uid; - int ret = -ESRCH; - int tmpio; - - rcu_read_lock(); - switch (which) { - case IOPRIO_WHO_PROCESS: - if (!who) - p = current; - else - p = find_task_by_vpid(who); - if (p) - ret = get_task_ioprio(p); - break; - case IOPRIO_WHO_PGRP: - if (!who) - pgrp = task_pgrp(current); - else - pgrp = find_vpid(who); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - tmpio = get_task_ioprio(p); - if (tmpio < 0) - continue; - if (ret == -ESRCH) - ret = tmpio; - else - ret = ioprio_best(ret, tmpio); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case IOPRIO_WHO_USER: - uid = make_kuid(current_user_ns(), who); - if (!who) - user = current_user(); - else - user = find_user(uid); - - if (!user) - break; - - do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) - continue; - tmpio = get_task_ioprio(p); - if (tmpio < 0) - continue; - if (ret == -ESRCH) - ret = tmpio; - else - ret = ioprio_best(ret, tmpio); - } while_each_thread(g, p); - - if (who) - free_uid(user); - break; - default: - ret = -EINVAL; - } - - rcu_read_unlock(); - return ret; -} diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 25c713e7071c..8898bbd2b61e 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -231,19 +231,15 @@ record_cache_failure: static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) { - int shift = 0; - int tmp = hash_size; + int i; struct jbd_revoke_table_s *table; table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); if (!table) goto out; - while((tmp >>= 1UL) != 0UL) - shift++; - table->hash_size = hash_size; - table->hash_shift = shift; + table->hash_shift = ilog2(hash_size); table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { @@ -252,8 +248,8 @@ static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) goto out; } - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&table->hash_table[tmp]); + for (i = 0; i < hash_size; i++) + INIT_LIST_HEAD(&table->hash_table[i]); out: return table; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5f26139a165a..6fac74349856 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -43,7 +43,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) clear_buffer_uptodate(bh); if (orig_bh) { clear_bit_unlock(BH_Shadow, &orig_bh->b_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&orig_bh->b_state, BH_Shadow); } unlock_buffer(bh); @@ -239,7 +239,7 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } spin_unlock(&journal->j_list_lock); @@ -277,7 +277,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, } spin_lock(&journal->j_list_lock); clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 38cfcf5f6fce..6f0f590cc5a3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1588,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle) * to perform a synchronous write. We do this to detect the * case where a single process is doing a stream of sync * writes. No point in waiting for joiners in that case. + * + * Setting max_batch_time to 0 disables this completely. */ pid = current->pid; - if (handle->h_sync && journal->j_last_sync_writer != pid) { + if (handle->h_sync && journal->j_last_sync_writer != pid && + journal->j_max_batch_time) { u64 commit_time, trans_time; journal->j_last_sync_writer = pid; diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 2b60ce1996aa..bb9cebc9ca8a 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -75,10 +75,13 @@ void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c) static int jffs2_garbage_collect_thread(void *_c) { struct jffs2_sb_info *c = _c; + sigset_t hupmask; + siginitset(&hupmask, sigmask(SIGHUP)); allow_signal(SIGKILL); allow_signal(SIGSTOP); allow_signal(SIGCONT); + allow_signal(SIGHUP); c->gc_task = current; complete(&c->gc_thread_start); @@ -87,7 +90,7 @@ static int jffs2_garbage_collect_thread(void *_c) set_freezable(); for (;;) { - allow_signal(SIGHUP); + sigprocmask(SIG_UNBLOCK, &hupmask, NULL); again: spin_lock(&c->erase_completion_lock); if (!jffs2_thread_should_wake(c)) { @@ -95,10 +98,9 @@ static int jffs2_garbage_collect_thread(void *_c) spin_unlock(&c->erase_completion_lock); jffs2_dbg(1, "%s(): sleeping...\n", __func__); schedule(); - } else + } else { spin_unlock(&c->erase_completion_lock); - - + } /* Problem - immediately after bootup, the GCD spends a lot * of time in places like jffs2_kill_fragtree(); so much so * that userspace processes (like gdm and X) are starved @@ -150,7 +152,7 @@ static int jffs2_garbage_collect_thread(void *_c) } } /* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */ - disallow_signal(SIGHUP); + sigprocmask(SIG_BLOCK, &hupmask, NULL); jffs2_dbg(1, "%s(): pass\n", __func__); if (jffs2_garbage_collect_pass(c) == -ENOSPC) { diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 256cd19a3b78..64989ca9ba90 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -51,10 +51,10 @@ const struct file_operations jffs2_file_operations = { .llseek = generic_file_llseek, .open = generic_file_open, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .unlocked_ioctl=jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 5a8ea16eedbc..0c8ca830b113 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -83,13 +83,15 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: ea_name = POSIX_ACL_XATTR_ACCESS; - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - if (rc < 0) - return rc; - inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); - if (rc == 0) - acl = NULL; + if (acl) { + rc = posix_acl_equiv_mode(acl, &inode->i_mode); + if (rc < 0) + return rc; + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + if (rc == 0) + acl = NULL; + } break; case ACL_TYPE_DEFAULT: ea_name = POSIX_ACL_XATTR_DEFAULT; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 794da944d5cd..33aa0cc1f8b8 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -151,13 +151,13 @@ const struct inode_operations jfs_file_inode_operations = { const struct file_operations jfs_file_operations = { .open = jfs_open, .llseek = generic_file_llseek, - .write = do_sync_write, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .write = new_sync_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fsync = jfs_fsync, .release = jfs_release, .unlocked_ioctl = jfs_ioctl, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 6f8fe72c2a7a..bd3df1ca3c9b 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -331,15 +331,15 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block) } static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - jfs_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block); /* * In case of error extending write may have instantiated a few @@ -347,7 +347,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) jfs_write_failed(mapping, end); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 370d7b6c5942..2d514c7affc2 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1208,7 +1208,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, * by this leaf. */ l2size = - min((int)leaf[word], NLSTOL2BSZ(nwords)); + min_t(int, leaf[word], NLSTOL2BSZ(nwords)); /* determine how many words were handled. */ @@ -1902,7 +1902,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) /* determine how many blocks to allocate from this dmap. */ - nb = min(n, (s64)BPERDMAP); + nb = min_t(s64, n, BPERDMAP); /* allocate the blocks from the dmap. */ @@ -2260,7 +2260,8 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * of bits being allocated and the l2 number * of bits currently described by this leaf. */ - size = min((int)leaf[word], NLSTOL2BSZ(nwords)); + size = min_t(int, leaf[word], + NLSTOL2BSZ(nwords)); /* update the leaf to reflect the allocation. * in addition to setting the leaf value to @@ -3563,7 +3564,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) if (mp == NULL) goto errout; - n = min(nblocks, (s64)BPERDMAP); + n = min_t(s64, nblocks, BPERDMAP); } dp = (struct dmap *) mp->data; diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 7f464c513ba0..6b0f816201a2 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -29,20 +29,20 @@ void jfs_set_inode_flags(struct inode *inode) { unsigned int flags = JFS_IP(inode)->mode2; - - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | - S_NOATIME | S_DIRSYNC | S_SYNC); + unsigned int new_fl = 0; if (flags & JFS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & JFS_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & JFS_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & JFS_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; if (flags & JFS_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME | + S_DIRSYNC | S_SYNC); } void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 8d811e02b4b9..0acddf60af55 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -167,7 +167,7 @@ do { \ * Global list of active external journals */ static LIST_HEAD(jfs_external_logs); -static struct jfs_log *dummy_log = NULL; +static struct jfs_log *dummy_log; static DEFINE_MUTEX(jfs_log_mutex); /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 97f7fda51890..adf8cb045b9e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -50,14 +50,14 @@ MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); MODULE_LICENSE("GPL"); -static struct kmem_cache * jfs_inode_cachep; +static struct kmem_cache *jfs_inode_cachep; static const struct super_operations jfs_super_operations; static const struct export_operations jfs_export_operations; static struct file_system_type jfs_fs_type; #define MAX_COMMIT_THREADS 64 -static int commit_threads = 0; +static int commit_threads; module_param(commit_threads, int, 0); MODULE_PARM_DESC(commit_threads, "Number of commit threads"); @@ -84,8 +84,7 @@ static void jfs_handle_error(struct super_block *sb) panic("JFS (device %s): panic forced after error\n", sb->s_id); else if (sbi->flag & JFS_ERR_REMOUNT_RO) { - jfs_err("ERROR: (device %s): remounting filesystem " - "as read-only\n", + jfs_err("ERROR: (device %s): remounting filesystem as read-only\n", sb->s_id); sb->s_flags |= MS_RDONLY; } @@ -273,7 +272,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_resize: { char *resize = args[0].from; - *newLVSize = simple_strtoull(resize, &resize, 0); + int rc = kstrtoll(resize, 0, newLVSize); + + if (rc) + goto cleanup; break; } case Opt_resize_nosize: @@ -327,7 +329,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_uid: { char *uid = args[0].from; - uid_t val = simple_strtoul(uid, &uid, 0); + uid_t val; + int rc = kstrtouint(uid, 0, &val); + + if (rc) + goto cleanup; sbi->uid = make_kuid(current_user_ns(), val); if (!uid_valid(sbi->uid)) goto cleanup; @@ -337,7 +343,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_gid: { char *gid = args[0].from; - gid_t val = simple_strtoul(gid, &gid, 0); + gid_t val; + int rc = kstrtouint(gid, 0, &val); + + if (rc) + goto cleanup; sbi->gid = make_kgid(current_user_ns(), val); if (!gid_valid(sbi->gid)) goto cleanup; @@ -347,7 +357,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_umask: { char *umask = args[0].from; - sbi->umask = simple_strtoul(umask, &umask, 8); + int rc = kstrtouint(umask, 8, &sbi->umask); + + if (rc) + goto cleanup; if (sbi->umask & ~0777) { pr_err("JFS: Invalid value of umask\n"); goto cleanup; @@ -363,12 +376,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, * -> user has more control over the online trimming */ sbi->minblks_trim = 64; - if (blk_queue_discard(q)) { + if (blk_queue_discard(q)) *flag |= JFS_DISCARD; - } else { - pr_err("JFS: discard option " \ - "not supported on device\n"); - } + else + pr_err("JFS: discard option not supported on device\n"); break; } @@ -380,20 +391,21 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, { struct request_queue *q = bdev_get_queue(sb->s_bdev); char *minblks_trim = args[0].from; + int rc; if (blk_queue_discard(q)) { *flag |= JFS_DISCARD; - sbi->minblks_trim = simple_strtoull( - minblks_trim, &minblks_trim, 0); - } else { - pr_err("JFS: discard option " \ - "not supported on device\n"); - } + rc = kstrtouint(minblks_trim, 0, + &sbi->minblks_trim); + if (rc) + goto cleanup; + } else + pr_err("JFS: discard option not supported on device\n"); break; } default: - printk("jfs: Unrecognized mount option \"%s\" " - " or missing value\n", p); + printk("jfs: Unrecognized mount option \"%s\" or missing value\n", + p); goto cleanup; } } @@ -419,14 +431,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) int ret; sync_filesystem(sb); - if (!parse_options(data, sb, &newLVSize, &flag)) { + if (!parse_options(data, sb, &newLVSize, &flag)) return -EINVAL; - } if (newLVSize) { if (sb->s_flags & MS_RDONLY) { - pr_err("JFS: resize requires volume" \ - " to be mounted read-write\n"); + pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } rc = jfs_extendfs(sb, newLVSize, 0); @@ -452,9 +462,8 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) } if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { rc = dquot_suspend(sb, -1); - if (rc < 0) { + if (rc < 0) return rc; - } rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; return rc; @@ -487,7 +496,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (!new_valid_dev(sb->s_bdev->bd_dev)) return -EOVERFLOW; - sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; @@ -548,9 +557,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) rc = jfs_mount(sb); if (rc) { - if (!silent) { + if (!silent) jfs_err("jfs_mount failed w/return code = %d", rc); - } goto out_mount_failed; } if (sb->s_flags & MS_RDONLY) @@ -587,7 +595,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) * Page cache is indexed by long. * I would use MAX_LFS_FILESIZE, but it's only half as big */ - sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, + (u64)sb->s_maxbytes); #endif sb->s_time_gran = 1; return 0; @@ -597,9 +606,8 @@ out_no_root: out_no_rw: rc = jfs_umount(sb); - if (rc) { + if (rc) jfs_err("jfs_umount failed with return code %d", rc); - } out_mount_failed: filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); @@ -924,7 +932,8 @@ static int __init init_jfs_fs(void) commit_threads = MAX_COMMIT_THREADS; for (i = 0; i < commit_threads; i++) { - jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit"); + jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, + "jfsCommit"); if (IS_ERR(jfsCommitThread[i])) { rc = PTR_ERR(jfsCommitThread[i]); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 78f3403300af..a693f5b01ae6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -232,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn) struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; - if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs++; - while (*node) { struct kernfs_node *pos; int result; @@ -249,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn) else return -EEXIST; } + /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + return 0; } @@ -711,6 +714,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, return ERR_PTR(-ENOMEM); ida_init(&root->ino_ida); + INIT_LIST_HEAD(&root->supers); kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8034706a7af8..d895b4b7b661 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -14,6 +14,7 @@ #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/fsnotify.h> #include "kernfs-internal.h" @@ -38,6 +39,19 @@ struct kernfs_open_node { struct list_head files; /* goes through kernfs_open_file.list */ }; +/* + * kernfs_notify() may be called from any context and bounces notifications + * through a work item. To minimize space overhead in kernfs_node, the + * pending queue is implemented as a singly linked list of kernfs_nodes. + * The list is terminated with the self pointer so that whether a + * kernfs_node is on the list or not can be determined by testing the next + * pointer for NULL. + */ +#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) + +static DEFINE_SPINLOCK(kernfs_notify_lock); +static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; + static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; @@ -484,6 +498,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); + if (rc) + goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() @@ -608,6 +624,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; @@ -622,14 +639,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) has_write = ops->write || ops->mmap; has_mmap = ops->mmap; - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; @@ -777,28 +796,84 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) return DEFAULT_POLLMASK|POLLERR|POLLPRI; } +static void kernfs_notify_workfn(struct work_struct *work) +{ + struct kernfs_node *kn; + struct kernfs_open_node *on; + struct kernfs_super_info *info; +repeat: + /* pop one off the notify_list */ + spin_lock_irq(&kernfs_notify_lock); + kn = kernfs_notify_list; + if (kn == KERNFS_NOTIFY_EOL) { + spin_unlock_irq(&kernfs_notify_lock); + return; + } + kernfs_notify_list = kn->attr.notify_next; + kn->attr.notify_next = NULL; + spin_unlock_irq(&kernfs_notify_lock); + + /* kick poll */ + spin_lock_irq(&kernfs_open_node_lock); + + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + + spin_unlock_irq(&kernfs_open_node_lock); + + /* kick fsnotify */ + mutex_lock(&kernfs_mutex); + + list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct inode *inode; + struct dentry *dentry; + + inode = ilookup(info->sb, kn->ino); + if (!inode) + continue; + + dentry = d_find_any_alias(inode); + if (dentry) { + fsnotify_parent(NULL, dentry, FS_MODIFY); + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + NULL, 0); + dput(dentry); + } + + iput(inode); + } + + mutex_unlock(&kernfs_mutex); + kernfs_put(kn); + goto repeat; +} + /** * kernfs_notify - notify a kernfs file * @kn: file to notify * - * Notify @kn such that poll(2) on @kn wakes up. + * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any + * context. */ void kernfs_notify(struct kernfs_node *kn) { - struct kernfs_open_node *on; + static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; - spin_lock_irqsave(&kernfs_open_node_lock, flags); + if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) + return; - if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) { - on = kn->attr.open; - if (on) { - atomic_inc(&on->event); - wake_up_interruptible(&on->poll); - } + spin_lock_irqsave(&kernfs_notify_lock, flags); + if (!kn->attr.notify_next) { + kernfs_get(kn); + kn->attr.notify_next = kernfs_notify_list; + kernfs_notify_list = kn; + schedule_work(&kernfs_notify_work); } - - spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index abb0f1f53d93..985217626e66 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -48,14 +48,18 @@ void __init kernfs_inode_init(void) static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; struct iattr *iattrs; + mutex_lock(&iattr_mutex); + if (kn->iattr) - return kn->iattr; + goto out_unlock; kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) - return NULL; + goto out_unlock; iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ @@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; simple_xattrs_init(&kn->iattr->xattrs); - - return kn->iattr; +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 8be13b2a079b..dc84a3ef9ca2 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -49,6 +49,8 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) * mount.c */ struct kernfs_super_info { + struct super_block *sb; + /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. @@ -62,6 +64,9 @@ struct kernfs_super_info { * an array and compare kernfs_node tag against every entry. */ const void *ns; + + /* anchored at kernfs_root->supers, protected by kernfs_mutex */ + struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 6a5f04ac8704..f973ae9b05f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -62,15 +62,16 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) return NULL; } -static int kernfs_fill_super(struct super_block *sb) +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; struct dentry *root; + info->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; + sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_time_gran = 1; @@ -131,6 +132,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * @@ -142,8 +144,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim. */ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -166,12 +168,18 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, *new_sb_created = !sb->s_root; if (!sb->s_root) { - error = kernfs_fill_super(sb); + struct kernfs_super_info *info = kernfs_info(sb); + + error = kernfs_fill_super(sb, magic); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); } sb->s_flags |= MS_ACTIVE; + + mutex_lock(&kernfs_mutex); + list_add(&info->node, &root->supers); + mutex_unlock(&kernfs_mutex); } return dget(sb->s_root); @@ -190,6 +198,10 @@ void kernfs_kill_sb(struct super_block *sb) struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *root_kn = sb->s_root->d_fsdata; + mutex_lock(&kernfs_mutex); + list_del(&info->node); + mutex_unlock(&kernfs_mutex); + /* * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing kernfs_super_info. @@ -199,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb) kernfs_put(root_kn); } +/** + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root + * @kernfs_root: the kernfs_root in question + * @ns: the namespace tag + * + * Pin the superblock so the superblock won't be destroyed in subsequent + * operations. This can be used to block ->kill_sb() which may be useful + * for kernfs users which dynamically manage superblocks. + * + * Returns NULL if there's no superblock associated to this kernfs_root, or + * -EINVAL if the superblock is being freed. + */ +struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) +{ + struct kernfs_super_info *info; + struct super_block *sb = NULL; + + mutex_lock(&kernfs_mutex); + list_for_each_entry(info, &root->supers, node) { + if (info->ns == ns) { + sb = info->sb; + if (!atomic_inc_not_zero(&info->sb->s_active)) + sb = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&kernfs_mutex); + return sb; +} + void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", diff --git a/fs/libfs.c b/fs/libfs.c index a1844244246f..88e3e00e2eca 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -3,6 +3,7 @@ * Library for filesystems writers. */ +#include <linux/blkdev.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> @@ -923,16 +924,19 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** - * generic_file_fsync - generic fsync implementation for simple filesystems + * __generic_file_fsync - generic fsync implementation for simple filesystems + * * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. */ -int generic_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) +int __generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int err; @@ -952,10 +956,34 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; + out: mutex_unlock(&inode->i_mutex); return ret; } +EXPORT_SYMBOL(__generic_file_fsync); + +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * with flush + * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) + * @datasync: only synchronize essential metadata if true + * + */ + +int generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + + err = __generic_file_fsync(file, start, end, datasync); + if (err) + return err; + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); +} EXPORT_SYMBOL(generic_file_fsync); /** diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 00ec0b9c94d1..d3e40db28930 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -14,6 +14,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs3.h> + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 9a55797a1cd4..3e9f7874b975 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -15,6 +15,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs2.h> + #define NLMDBG_FACILITY NLMDBG_XDR #if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 6bf06a07f3e0..8f27c93f8d2e 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -436,7 +436,7 @@ EXPORT_SYMBOL_GPL(lockd_down); * Sysctl parameters (same as module parameters, different interface). */ -static ctl_table nlm_sysctls[] = { +static struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, @@ -490,7 +490,7 @@ static ctl_table nlm_sysctls[] = { { } }; -static ctl_table nlm_sysctl_dir[] = { +static struct ctl_table nlm_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -499,7 +499,7 @@ static ctl_table nlm_sysctl_dir[] = { { } }; -static ctl_table nlm_sysctl_root[] = { +static struct ctl_table nlm_sysctl_root[] = { { .procname = "fs", .mode = 0555, @@ -622,8 +622,8 @@ static int __init init_nlm(void) err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); -#endif err_sysctl: +#endif return err; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dc5c75930f0f..b6f3b84b6e99 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -14,12 +14,11 @@ #include <linux/mutex.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/addr.h> -#include <linux/nfsd/nfsfh.h> -#include <linux/nfsd/export.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> #include <linux/module.h> #include <linux/mount.h> +#include <uapi/linux/nfs2.h> #define NLMDBG_FACILITY NLMDBG_SVCSUBS diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 964666c68a86..9340e7e10ef6 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -16,6 +16,8 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> +#include <uapi/linux/nfs2.h> + #define NLMDBG_FACILITY NLMDBG_XDR diff --git a/fs/locks.c b/fs/locks.c index 13fc7a6d380a..717fbc404e6b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -130,12 +130,15 @@ #include <linux/percpu.h> #include <linux/lglock.h> +#define CREATE_TRACE_POINTS +#include <trace/events/filelock.h> + #include <asm/uaccess.h> #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) -#define IS_FILE_PVT(fl) (fl->fl_flags & FL_FILE_PVT) +#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) { @@ -322,6 +325,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, return -ENOMEM; fl->fl_file = filp; + fl->fl_owner = (fl_owner_t)filp; fl->fl_pid = current->tgid; fl->fl_flags = FL_FLOCK; fl->fl_type = type; @@ -389,18 +393,6 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, fl->fl_ops = NULL; fl->fl_lmops = NULL; - /* Ensure that fl->fl_filp has compatible f_mode */ - switch (l->l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - break; - } - return assign_type(fl, l->l_type); } @@ -439,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) if (assign_type(fl, type) != 0) return -EINVAL; - fl->fl_owner = current->files; + fl->fl_owner = (fl_owner_t)current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -564,7 +556,7 @@ static void __locks_insert_block(struct file_lock *blocker, BUG_ON(!list_empty(&waiter->fl_block)); waiter->fl_next = blocker; list_add_tail(&waiter->fl_block, &blocker->fl_block); - if (IS_POSIX(blocker) && !IS_FILE_PVT(blocker)) + if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) locks_insert_global_blocked(waiter); } @@ -759,12 +751,12 @@ EXPORT_SYMBOL(posix_test_lock); * of tasks (such as posix threads) sharing the same open file table. * To handle those cases, we just bail out after a few iterations. * - * For FL_FILE_PVT locks, the owner is the filp, not the files_struct. + * For FL_OFDLCK locks, the owner is the filp, not the files_struct. * Because the owner is not even nominally tied to a thread of * execution, the deadlock detection below can't reasonably work well. Just * skip it for those. * - * In principle, we could do a more limited deadlock detection on FL_FILE_PVT + * In principle, we could do a more limited deadlock detection on FL_OFDLCK * locks that just checks for the case where two tasks are attempting to * upgrade from read to write locks on the same inode. */ @@ -791,9 +783,9 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, /* * This deadlock detector can't reasonably detect deadlocks with - * FL_FILE_PVT locks, since they aren't owned by a process, per-se. + * FL_OFDLCK locks, since they aren't owned by a process, per-se. */ - if (IS_FILE_PVT(caller_fl)) + if (IS_OFDLCK(caller_fl)) return 0; while ((block_fl = what_owner_is_waiting_for(block_fl))) { @@ -1298,6 +1290,7 @@ static void time_out_leases(struct inode *inode) before = &inode->i_flock; while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) lease_modify(before, F_RDLCK); if (past_time(fl->fl_break_time)) @@ -1385,22 +1378,24 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } if (i_have_this_lease || (mode & O_NONBLOCK)) { + trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; } restart: break_time = flock->fl_break_time; - if (break_time != 0) { + if (break_time != 0) break_time -= jiffies; - if (break_time == 0) - break_time++; - } + if (break_time == 0) + break_time++; locks_insert_block(flock, new_fl); + trace_break_lease_block(inode, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); spin_lock(&inode->i_lock); + trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { if (error == 0) @@ -1522,6 +1517,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp int error; lease = *flp; + trace_generic_add_lease(inode, lease); + /* * In the delegation case we need mutual exclusion with * a number of operations that take the i_mutex. We trylock @@ -1611,6 +1608,8 @@ static int generic_delete_lease(struct file *filp, struct file_lock **flp) struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + trace_generic_delete_lease(inode, *flp); + for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { @@ -1891,7 +1890,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1913,7 +1912,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_FILE_PVT(fl) ? -1 : fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -1942,13 +1941,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_GETLK; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2035,6 +2034,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } +/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +static int +check_fmode_for_setlk(struct file_lock *fl) +{ + switch (fl->fl_type) { + case F_RDLCK: + if (!(fl->fl_file->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(fl->fl_file->f_mode & FMODE_WRITE)) + return -EBADF; + } + return 0; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -2072,27 +2087,31 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLK; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLKW; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW: @@ -2144,13 +2163,13 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - if (cmd == F_GETLKP) { + if (cmd == F_OFD_GETLK) { error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_GETLK64; - file_lock.fl_flags |= FL_FILE_PVT; + file_lock.fl_flags |= FL_OFDLCK; file_lock.fl_owner = (fl_owner_t)filp; } @@ -2207,27 +2226,31 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the - * FL_FILE_PVT flag and override the owner. + * FL_OFDLCK flag and override the owner. */ switch (cmd) { - case F_SETLKP: + case F_OFD_SETLK: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLK64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; break; - case F_SETLKPW: + case F_OFD_SETLKW: error = -EINVAL; if (flock.l_pid != 0) goto out; cmd = F_SETLKW64; - file_lock->fl_flags |= FL_FILE_PVT; + file_lock->fl_flags |= FL_OFDLCK; file_lock->fl_owner = (fl_owner_t)filp; /* Fallthrough */ case F_SETLKW64: @@ -2305,6 +2328,7 @@ void locks_remove_file(struct file *filp) if (filp->f_op->flock) { struct file_lock fl = { + .fl_owner = (fl_owner_t)filp, .fl_pid = current->tgid, .fl_file = filp, .fl_flags = FL_FLOCK, @@ -2412,31 +2436,31 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { if (fl->fl_flags & FL_ACCESS) - seq_printf(f, "ACCESS"); - else if (IS_FILE_PVT(fl)) - seq_printf(f, "FLPVT "); + seq_puts(f, "ACCESS"); + else if (IS_OFDLCK(fl)) + seq_puts(f, "OFDLCK"); else - seq_printf(f, "POSIX "); + seq_puts(f, "POSIX "); seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); } else if (IS_FLOCK(fl)) { if (fl->fl_type & LOCK_MAND) { - seq_printf(f, "FLOCK MSNFS "); + seq_puts(f, "FLOCK MSNFS "); } else { - seq_printf(f, "FLOCK ADVISORY "); + seq_puts(f, "FLOCK ADVISORY "); } } else if (IS_LEASE(fl)) { - seq_printf(f, "LEASE "); + seq_puts(f, "LEASE "); if (lease_breaking(fl)) - seq_printf(f, "BREAKING "); + seq_puts(f, "BREAKING "); else if (fl->fl_file) - seq_printf(f, "ACTIVE "); + seq_puts(f, "ACTIVE "); else - seq_printf(f, "BREAKER "); + seq_puts(f, "BREAKER "); } else { - seq_printf(f, "UNKNOWN UNKNOWN "); + seq_puts(f, "UNKNOWN UNKNOWN "); } if (fl->fl_type & LOCK_MAND) { seq_printf(f, "%s ", @@ -2468,7 +2492,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, else seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end); } else { - seq_printf(f, "0 EOF\n"); + seq_puts(f, "0 EOF\n"); } } diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 57914fc32b62..8538752df2f6 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -264,15 +264,15 @@ const struct inode_operations logfs_reg_iops = { }; const struct file_operations logfs_reg_fops = { - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .fsync = logfs_fsync, .unlocked_ioctl = logfs_ioctl, .llseek = generic_file_llseek, .mmap = generic_file_readonly_mmap, .open = generic_file_open, - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, }; const struct address_space_operations logfs_reg_aops = { diff --git a/fs/mbcache.c b/fs/mbcache.c index bf166e388f0d..187477ded6b3 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -73,6 +73,7 @@ #include <linux/mbcache.h> #include <linux/init.h> #include <linux/blockgroup_lock.h> +#include <linux/log2.h> #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -93,7 +94,7 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) -#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) diff --git a/fs/minix/file.c b/fs/minix/file.c index adc6f5494231..a967de085ac0 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -14,10 +14,10 @@ */ const struct file_operations minix_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/mpage.c b/fs/mpage.c index 4979ffa60aaa..5f9ed622274f 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -48,23 +48,7 @@ static void mpage_end_io(struct bio *bio, int err) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - - if (bio_data_dir(bio) == READ) { - if (!err) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } else { /* bio_data_dir(bio) == WRITE */ - if (err) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - } - end_page_writeback(page); - } + page_endio(page, bio_data_dir(bio), err); } bio_put(bio); @@ -285,6 +269,11 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, alloc_new: if (bio == NULL) { + if (first_hole == blocks_per_page) { + if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), + page)) + goto out; + } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, nr_pages, bio_get_nr_vecs(bdev)), GFP_KERNEL); @@ -439,6 +428,35 @@ struct mpage_data { unsigned use_writepage; }; +/* + * We have our BIO, so we can now mark the buffers clean. Make + * sure to only clean buffers which we know we'll be writing. + */ +static void clean_buffers(struct page *page, unsigned first_unmapped) +{ + unsigned buffer_counter = 0; + struct buffer_head *bh, *head; + if (!page_has_buffers(page)) + return; + head = page_buffers(page); + bh = head; + + do { + if (buffer_counter++ == first_unmapped) + break; + clear_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* + * we cannot drop the bh if the page is not uptodate or a concurrent + * readpage would fail to serialize with the bh and it would read from + * disk before we reach the platter. + */ + if (buffer_heads_over_limit && PageUptodate(page)) + try_to_free_buffers(page); +} + static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -574,6 +592,13 @@ page_is_mapped: alloc_new: if (bio == NULL) { + if (first_unmapped == blocks_per_page) { + if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), + page, wbc)) { + clean_buffers(page, first_unmapped); + goto out; + } + } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) @@ -591,30 +616,7 @@ alloc_new: goto alloc_new; } - /* - * OK, we have our BIO, so we can now mark the buffers clean. Make - * sure to only clean buffers which we know we'll be writing. - */ - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh = head; - unsigned buffer_counter = 0; - - do { - if (buffer_counter++ == first_unmapped) - break; - clear_buffer_dirty(bh); - bh = bh->b_this_page; - } while (bh != head); - - /* - * we cannot drop the bh if the page is not uptodate - * or a concurrent readpage would fail to serialize with the bh - * and it would read from disk before we reach the platter. - */ - if (buffer_heads_over_limit && PageUptodate(page)) - try_to_free_buffers(page); - } + clean_buffers(page, first_unmapped); BUG_ON(PageWriteback(page)); set_page_writeback(page); diff --git a/fs/namei.c b/fs/namei.c index c6157c894fce..9eb787e5c167 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -332,10 +332,11 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, + CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -345,7 +346,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -353,7 +354,7 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; @@ -1542,7 +1543,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; if (should_follow_link(path->dentry, follow)) { @@ -2249,15 +2250,16 @@ mountpoint_last(struct nameidata *nd, struct path *path) mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode) { + if (!dentry->d_inode || d_is_negative(dentry)) { error = -ENOENT; dput(dentry); goto out; } path->dentry = dentry; - path->mnt = mntget(nd->path.mnt); + path->mnt = nd->path.mnt; if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) return 1; + mntget(path->mnt); follow_mount(path); error = 0; out: @@ -2379,7 +2381,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; - return !inode_capable(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } /* @@ -2994,7 +2996,7 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (d_is_negative(path->dentry)) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 03ffde1f44d6..344889cd120e 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -53,15 +53,14 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts return -EINVAL; } if (opts->has_arg & OPT_INT) { - char* v; + int rc = kstrtoul(val, 0, value); - *value = simple_strtoul(val, &v, 0); - if (!*v) { - return opts->val; + if (rc) { + pr_info("%s: invalid numeric value in %s=%s\n", + caller, token, val); + return rc; } - pr_info("%s: invalid numeric value in %s=%s\n", - caller, token, val); - return -EDOM; + return opts->val; } if (opts->has_arg & OPT_STRING) { return opts->val; diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 03192a66c143..4782e0840dcc 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -29,8 +29,6 @@ nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o -obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o -nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o - +obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 65d849bdf77a..9b431f44fad9 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -210,7 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err) SetPageUptodate(bvec->bv_page); if (err) { - struct nfs_read_data *rdata = par->data; + struct nfs_pgio_data *rdata = par->data; struct nfs_pgio_header *header = rdata->header; if (!header->pnfs_error) @@ -224,17 +224,17 @@ static void bl_end_io_read(struct bio *bio, int err) static void bl_read_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } static void bl_end_par_io_read(void *data, int unused) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rdata->task.tk_status = rdata->header->pnfs_error; INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); @@ -242,7 +242,7 @@ bl_end_par_io_read(void *data, int unused) } static enum pnfs_try_status -bl_read_pagelist(struct nfs_read_data *rdata) +bl_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *header = rdata->header; int i, hole; @@ -390,7 +390,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err) } if (unlikely(err)) { - struct nfs_write_data *data = par->data; + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!header->pnfs_error) @@ -405,7 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct nfs_write_data *data = par->data; + struct nfs_pgio_data *data = par->data; struct nfs_pgio_header *header = data->header; if (!uptodate) { @@ -423,10 +423,10 @@ static void bl_end_io_write(struct bio *bio, int err) static void bl_write_cleanup(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); if (likely(!wdata->header->pnfs_error)) { /* Marks for LAYOUTCOMMIT */ mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), @@ -438,7 +438,7 @@ static void bl_write_cleanup(struct work_struct *work) /* Called when last of bios associated with a bl_write_pagelist call finishes */ static void bl_end_par_io_write(void *data, int num_se) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(wdata->header->pnfs_error)) { bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, @@ -673,7 +673,7 @@ check_page: } static enum pnfs_try_status -bl_write_pagelist(struct nfs_write_data *wdata, int sync) +bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) { struct nfs_pgio_header *header = wdata->header; int i, ret, npg_zero, pg_index, last = 0; @@ -1189,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) pnfs_generic_pg_init_read(pgio, req); } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, SECTOR_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } @@ -1241,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } } -static bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { if (pgio->pg_dreq != NULL && !is_aligned_req(req, PAGE_CACHE_SIZE)) - return false; + return 0; return pnfs_generic_pg_test(pgio, prev, req); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d9f3d067cd15..4a3d4ef76127 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2032,9 +2032,9 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_rpccred(entry->cred); kfree(entry); - smp_mb__before_atomic_dec(); + smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); - smp_mb__after_atomic_dec(); + smp_mb__after_atomic(); } static void nfs_access_free_list(struct list_head *head) @@ -2082,9 +2082,9 @@ nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); } spin_unlock(&inode->i_lock); } @@ -2232,9 +2232,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) nfs_access_add_rbtree(inode, cache); /* Update accounting */ - smp_mb__before_atomic_inc(); + smp_mb__before_atomic(); atomic_long_inc(&nfs_access_nr_entries); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); /* Add inode to global LRU list */ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b8797ae6831f..f11b9eed0de1 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -108,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } +/* + * nfs_direct_select_verf - select the right verifier + * @dreq - direct request possibly spanning multiple servers + * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs + * @ds_idx - index of data server in data server list, only valid if ds_clp set + * + * returns the correct verifier to use given the role of the server + */ +static struct nfs_writeverf * +nfs_direct_select_verf(struct nfs_direct_req *dreq, + struct nfs_client *ds_clp, + int ds_idx) +{ + struct nfs_writeverf *verfp = &dreq->verf; + +#ifdef CONFIG_NFS_V4_1 + if (ds_clp) { + /* pNFS is in use, use the DS verf */ + if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; + else + WARN_ON_ONCE(1); + } +#endif + return verfp; +} + + +/* + * nfs_direct_set_hdr_verf - set the write/commit verifier + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verfs + * + * Set the server's (MDS or DS) "seen" verifier + */ +static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + WARN_ON_ONCE(verfp->committed >= 0); + memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + WARN_ON_ONCE(verfp->committed < 0); +} + +/* + * nfs_direct_cmp_hdr_verf - compare verifier for pgio header + * @dreq - direct request possibly spanning multiple servers + * @hdr - pageio header to validate against previously seen verf + * + * set the server's "seen" verf if not initialized. + * returns result of comparison between @hdr->verf and the "seen" + * verf of the server used by @hdr (DS or MDS) + */ +static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, + struct nfs_pgio_header *hdr) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, + hdr->data->ds_idx); + if (verfp->committed < 0) { + nfs_direct_set_hdr_verf(dreq, hdr); + return 0; + } + return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); +} + +#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_direct_cmp_commit_data_verf - compare verifier for commit data + * @dreq - direct request possibly spanning multiple servers + * @data - commit data to validate against previously seen verf + * + * returns result of comparison between @data->verf and the verf of + * the server used by @data (DS or MDS) + */ +static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, + struct nfs_commit_data *data) +{ + struct nfs_writeverf *verfp; + + verfp = nfs_direct_select_verf(dreq, data->ds_clp, + data->ds_commit_index); + WARN_ON_ONCE(verfp->committed < 0); + return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); +} +#endif + /** * nfs_direct_IO - NFS address space operation for direct I/O * @rw: direction (read or write) @@ -121,20 +212,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq) * shunt off direct read and write requests before the VFS gets them, * so this method is only ever called for swap. */ -ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) +ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { #ifndef CONFIG_NFS_SWAP dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, nr_segs); + iocb->ki_filp, (long long) pos, iter->nr_segs); return -EINVAL; #else VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ || rw == KERNEL_READ) - return nfs_file_direct_read(iocb, iov, nr_segs, pos, + return nfs_file_direct_read(iocb, iter, pos, rw == READ ? true : false); - return nfs_file_direct_write(iocb, iov, nr_segs, pos, + return nfs_file_direct_write(iocb, iter, pos, rw == WRITE ? true : false); #endif /* CONFIG_NFS_SWAP */ } @@ -168,6 +259,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); + dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); @@ -322,66 +414,42 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { * handled automatically by nfs_direct_read_result(). Otherwise, if * no requests have been sent, just return an error. */ -static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, - const struct iovec *iov, - loff_t pos, bool uio) -{ - struct nfs_direct_req *dreq = desc->pg_dreq; - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t rsize = NFS_SERVER(inode)->rsize; - unsigned int pgbase; - int result; - ssize_t started = 0; - struct page **pagevec = NULL; - unsigned int npages; - - do { - size_t bytes; - int i; - pgbase = user_addr & ~PAGE_MASK; - bytes = min(max_t(size_t, rsize, PAGE_SIZE), count); +static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, + struct iov_iter *iter, + loff_t pos) +{ + struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; + ssize_t result = -EINVAL; + size_t requested_bytes = 0; + size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE); - result = -ENOMEM; - npages = nfs_page_array_len(pgbase, bytes); - if (!pagevec) - pagevec = kmalloc(npages * sizeof(struct page *), - GFP_KERNEL); - if (!pagevec) - break; - if (uio) { - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 1, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; - } else { - WARN_ON(npages != 1); - result = get_kernel_page(user_addr, 1, pagevec); - if (WARN_ON(result != 1)) - break; - } + nfs_pageio_init_read(&desc, dreq->inode, false, + &nfs_direct_read_completion_ops); + get_dreq(dreq); + desc.pg_dreq = dreq; + atomic_inc(&inode->i_dio_count); - if ((unsigned)result < npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(pagevec, result); - break; - } - bytes -= pgbase; - npages = result; - } + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + result = iov_iter_get_pages_alloc(iter, &pagevec, + rsize, &pgbase); + if (result < 0) + break; + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */ - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, pgbase, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); @@ -389,56 +457,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de } req->wb_index = pos >> PAGE_SHIFT; req->wb_offset = pos & ~PAGE_MASK; - if (!nfs_pageio_add_request(desc, req)) { - result = desc->pg_error; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; nfs_release_request(req); break; } pgbase = 0; bytes -= req_len; - started += req_len; - user_addr += req_len; + requested_bytes += req_len; pos += req_len; - count -= req_len; dreq->bytes_left -= req_len; } - /* The nfs_page now hold references to these pages */ nfs_direct_release_pages(pagevec, npages); - } while (count != 0 && result >= 0); - - kfree(pagevec); - - if (started) - return started; - return result < 0 ? (ssize_t) result : -EFAULT; -} - -static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos, bool uio) -{ - struct nfs_pageio_descriptor desc; - struct inode *inode = dreq->inode; - ssize_t result = -EINVAL; - size_t requested_bytes = 0; - unsigned long seg; - - NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode, - &nfs_direct_read_completion_ops); - get_dreq(dreq); - desc.pg_dreq = dreq; - atomic_inc(&inode->i_dio_count); - - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; - result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); + kvfree(pagevec); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) - break; - pos += vec->iov_len; } nfs_pageio_complete(&desc); @@ -461,8 +494,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, /** * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector + * @iter: vector of user buffers into which to read data * @pos: byte offset in file where reading starts * * We use this function for direct reads instead of calling @@ -479,8 +511,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -488,9 +520,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; ssize_t result = -EINVAL; - size_t count; - - count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", @@ -513,7 +543,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, goto out_unlock; dreq->inode = inode; - dreq->bytes_left = iov_length(iov, nr_segs); + dreq->bytes_left = count; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -524,8 +554,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - NFS_I(inode)->read_io += iov_length(iov, nr_segs); - result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); + NFS_I(inode)->read_io += count; + result = nfs_direct_read_schedule_iovec(dreq, iter, pos); mutex_unlock(&inode->i_mutex); @@ -564,7 +594,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) dreq->count = 0; get_dreq(dreq); - NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE, + nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; @@ -603,7 +633,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) dprintk("NFS: %5u commit failed with error %d.\n", data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { + } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } @@ -681,109 +711,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode } #endif -/* - * NB: Return the value of the first error return code. Subsequent - * errors after the first one are ignored. - */ -/* - * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE - * operation. If nfs_writedata_alloc() or get_user_pages() fails, - * bail and stop sending more writes. Write length accounting is - * handled automatically by nfs_direct_write_result(). Otherwise, if - * no requests have been sent, just return an error. - */ -static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, - const struct iovec *iov, - loff_t pos, bool uio) -{ - struct nfs_direct_req *dreq = desc->pg_dreq; - struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->dentry->d_inode; - unsigned long user_addr = (unsigned long)iov->iov_base; - size_t count = iov->iov_len; - size_t wsize = NFS_SERVER(inode)->wsize; - unsigned int pgbase; - int result; - ssize_t started = 0; - struct page **pagevec = NULL; - unsigned int npages; - - do { - size_t bytes; - int i; - - pgbase = user_addr & ~PAGE_MASK; - bytes = min(max_t(size_t, wsize, PAGE_SIZE), count); - - result = -ENOMEM; - npages = nfs_page_array_len(pgbase, bytes); - if (!pagevec) - pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL); - if (!pagevec) - break; - - if (uio) { - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 0, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; - } else { - WARN_ON(npages != 1); - result = get_kernel_page(user_addr, 0, pagevec); - if (WARN_ON(result != 1)) - break; - } - - if ((unsigned)result < npages) { - bytes = result * PAGE_SIZE; - if (bytes <= pgbase) { - nfs_direct_release_pages(pagevec, result); - break; - } - bytes -= pgbase; - npages = result; - } - - for (i = 0; i < npages; i++) { - struct nfs_page *req; - unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - - req = nfs_create_request(dreq->ctx, dreq->inode, - pagevec[i], - pgbase, req_len); - if (IS_ERR(req)) { - result = PTR_ERR(req); - break; - } - nfs_lock_request(req); - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; - if (!nfs_pageio_add_request(desc, req)) { - result = desc->pg_error; - nfs_unlock_and_release_request(req); - break; - } - pgbase = 0; - bytes -= req_len; - started += req_len; - user_addr += req_len; - pos += req_len; - count -= req_len; - dreq->bytes_left -= req_len; - } - /* The nfs_page now hold references to these pages */ - nfs_direct_release_pages(pagevec, npages); - } while (count != 0 && result >= 0); - - kfree(pagevec); - - if (started) - return started; - return result < 0 ? (ssize_t) result : -EFAULT; -} - static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; @@ -813,13 +740,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, hdr->verf, - sizeof(dreq->verf)); + nfs_direct_set_hdr_verf(dreq, hdr); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { + dreq->flags = + NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else bit = NFS_IOHDR_NEED_COMMIT; @@ -829,6 +756,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); switch (bit) { @@ -863,33 +791,77 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .completion = nfs_direct_write_completion, }; + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +/* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. If nfs_writedata_alloc() or get_user_pages() fails, + * bail and stop sending more writes. Write length accounting is + * handled automatically by nfs_direct_write_result(). Otherwise, if + * no requests have been sent, just return an error. + */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos, bool uio) + struct iov_iter *iter, + loff_t pos) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; ssize_t result = 0; size_t requested_bytes = 0; - unsigned long seg; + size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); - NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE, + nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); atomic_inc(&inode->i_dio_count); - NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs); - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *vec = &iov[seg]; - result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); + NFS_I(inode)->write_io += iov_iter_count(iter); + while (iov_iter_count(iter)) { + struct page **pagevec; + size_t bytes; + size_t pgbase; + unsigned npages, i; + + result = iov_iter_get_pages_alloc(iter, &pagevec, + wsize, &pgbase); if (result < 0) break; - requested_bytes += result; - if ((size_t)result < vec->iov_len) + + bytes = result; + iov_iter_advance(iter, bytes); + npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; + for (i = 0; i < npages; i++) { + struct nfs_page *req; + unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); + + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, + pgbase, req_len); + if (IS_ERR(req)) { + result = PTR_ERR(req); + break; + } + nfs_lock_request(req); + req->wb_index = pos >> PAGE_SHIFT; + req->wb_offset = pos & ~PAGE_MASK; + if (!nfs_pageio_add_request(&desc, req)) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + pgbase = 0; + bytes -= req_len; + requested_bytes += req_len; + pos += req_len; + dreq->bytes_left -= req_len; + } + nfs_direct_release_pages(pagevec, npages); + kvfree(pagevec); + if (result < 0) break; - pos += vec->iov_len; } nfs_pageio_complete(&desc); @@ -911,8 +883,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block - * @iov: vector of user buffers from which to write data - * @nr_segs: size of iov vector + * @iter: vector of user buffers from which to write data * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling @@ -930,8 +901,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + loff_t pos, bool uio) { ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; @@ -940,9 +911,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; loff_t end; - size_t count; - - count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); end = (pos + count - 1) >> PAGE_CACHE_SHIFT; nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); @@ -993,7 +962,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 284ca901fe16..4042ff58fe3f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -165,22 +165,21 @@ nfs_file_flush(struct file *file, fl_owner_t id) EXPORT_SYMBOL_GPL(nfs_file_flush); ssize_t -nfs_file_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +nfs_file_read(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); + return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); - dprintk("NFS: read(%pD2, %lu@%lu)\n", + dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, - (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); + iov_iter_count(to), (unsigned long) iocb->ki_pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { - result = generic_file_aio_read(iocb, iov, nr_segs, pos); + result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } @@ -635,24 +634,24 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode) return 0; } -ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; result = nfs_key_timeout_notify(file, inode); if (result) return result; if (file->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); + return nfs_file_direct_write(iocb, from, pos, true); - dprintk("NFS: write(%pD2, %lu@%Ld)\n", - file, (unsigned long) count, (long long) pos); + dprintk("NFS: write(%pD2, %zu@%Ld)\n", + file, count, (long long) pos); result = -EBUSY; if (IS_SWAPFILE(inode)) @@ -670,7 +669,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; - result = generic_file_aio_write(iocb, iov, nr_segs, pos); + result = generic_file_write_iter(iocb, from); if (result > 0) written = result; @@ -691,36 +690,6 @@ out_swapfile: } EXPORT_SYMBOL_GPL(nfs_file_write); -ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, - struct file *filp, loff_t *ppos, - size_t count, unsigned int flags) -{ - struct inode *inode = file_inode(filp); - unsigned long written = 0; - ssize_t ret; - - dprintk("NFS splice_write(%pD2, %lu@%llu)\n", - filp, (unsigned long) count, (unsigned long long) *ppos); - - /* - * The combination of splice and an O_APPEND destination is disallowed. - */ - - ret = generic_file_splice_write(pipe, filp, ppos, count, flags); - if (ret > 0) - written = ret; - - if (ret >= 0 && nfs_need_sync_write(filp, inode)) { - int err = vfs_fsync(filp, 0); - if (err < 0) - ret = err; - } - if (ret > 0) - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); - return ret; -} -EXPORT_SYMBOL_GPL(nfs_file_splice_write); - static int do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -916,10 +885,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) is_local = 1; /* We're simulating flock() locks using posix locks on the server */ - fl->fl_owner = (fl_owner_t)filp; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - if (fl->fl_type == F_UNLCK) return do_unlk(filp, cmd, fl, is_local); return do_setlk(filp, cmd, fl, is_local); @@ -939,10 +904,10 @@ EXPORT_SYMBOL_GPL(nfs_setlease); const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = nfs_file_read, + .write_iter = nfs_file_write, .mmap = nfs_file_mmap, .open = nfs_file_open, .flush = nfs_file_flush, @@ -951,7 +916,7 @@ const struct file_operations nfs_file_operations = { .lock = nfs_lock, .flock = nfs_flock, .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, + .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = nfs_setlease, }; diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile new file mode 100644 index 000000000000..8516cdffb9e9 --- /dev/null +++ b/fs/nfs/filelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Files Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/filelayout/filelayout.c index b9a35c05b60f..d2eba1c13b7e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -35,11 +35,11 @@ #include <linux/sunrpc/metrics.h> -#include "nfs4session.h" -#include "internal.h" -#include "delegation.h" -#include "nfs4filelayout.h" -#include "nfs4trace.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "filelayout.h" +#include "../nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -84,7 +84,7 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } -static void filelayout_reset_write(struct nfs_write_data *data) +static void filelayout_reset_write(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; @@ -105,7 +105,7 @@ static void filelayout_reset_write(struct nfs_write_data *data) } } -static void filelayout_reset_read(struct nfs_read_data *data) +static void filelayout_reset_read(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct rpc_task *task = &data->task; @@ -243,7 +243,7 @@ wait_on_recovery: /* NFS_PROTO call done callback routines */ static int filelayout_read_done_cb(struct rpc_task *task, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -270,7 +270,7 @@ static int filelayout_read_done_cb(struct rpc_task *task, * rfc5661 is not clear about which credential should be used. */ static void -filelayout_set_layoutcommit(struct nfs_write_data *wdata) +filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; @@ -279,7 +279,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata) return; pnfs_set_layoutcommit(wdata); - dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -305,7 +305,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg) */ static void filelayout_read_prepare(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -317,7 +317,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - rdata->read_done_cb = filelayout_read_done_cb; + rdata->pgio_done_cb = filelayout_read_done_cb; if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, @@ -331,7 +331,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) static void filelayout_read_call_done(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); @@ -347,14 +347,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data) static void filelayout_read_count_stats(struct rpc_task *task, void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); } static void filelayout_read_release(void *data) { - struct nfs_read_data *rdata = data; + struct nfs_pgio_data *rdata = data; struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -363,7 +363,7 @@ static void filelayout_read_release(void *data) } static int filelayout_write_done_cb(struct rpc_task *task, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; int err; @@ -419,7 +419,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, static void filelayout_write_prepare(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { rpc_exit(task, -EIO); @@ -443,7 +443,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) static void filelayout_write_call_done(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && task->tk_status == 0) { @@ -457,14 +457,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data) static void filelayout_write_count_stats(struct rpc_task *task, void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); } static void filelayout_write_release(void *data) { - struct nfs_write_data *wdata = data; + struct nfs_pgio_data *wdata = data; struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; filelayout_fenceme(lo->plh_inode, lo); @@ -529,7 +529,7 @@ static const struct rpc_call_ops filelayout_commit_call_ops = { }; static enum pnfs_try_status -filelayout_read_pagelist(struct nfs_read_data *data) +filelayout_read_pagelist(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -560,6 +560,7 @@ filelayout_read_pagelist(struct nfs_read_data *data) /* No multipath support. Use first DS */ atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; @@ -568,14 +569,14 @@ filelayout_read_pagelist(struct nfs_read_data *data) data->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_read(ds_clnt, data, - &filelayout_read_call_ops, RPC_TASK_SOFTCONN); + nfs_initiate_pgio(ds_clnt, data, + &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } /* Perform async writes. */ static enum pnfs_try_status -filelayout_write_pagelist(struct nfs_write_data *data, int sync) +filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) { struct nfs_pgio_header *hdr = data->header; struct pnfs_layout_segment *lseg = hdr->lseg; @@ -600,20 +601,18 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); - data->write_done_cb = filelayout_write_done_cb; + data->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; + data->ds_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) data->args.fh = fh; - /* - * Get the file offset on the dserver. Set the write offset to - * this offset and save the original offset. - */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_write(ds_clnt, data, + nfs_initiate_pgio(ds_clnt, data, &filelayout_write_call_ops, sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; @@ -637,7 +636,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, struct nfs4_deviceid_node *d; struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; - struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); dprintk("--> %s\n", __func__); @@ -655,7 +653,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } - if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + if (!fl->stripe_unit) { dprintk("%s Invalid stripe unit (%u)\n", __func__, fl->stripe_unit); goto out; @@ -692,12 +690,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out_put; } - if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { - dprintk("%s Stripe unit (%u) not aligned with rsize %u " - "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, - nfss->wsize); - } - status = 0; out: dprintk("--> %s returns %d\n", __func__, status); @@ -850,11 +842,15 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); struct pnfs_commit_bucket *buckets; - int size; + int size, i; if (fl->commit_through_mds) return 0; - if (cinfo->ds->nbuckets != 0) { + + size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + if (cinfo->ds->nbuckets >= size) { /* This assumes there is only one IOMODE_RW lseg. What * we really want to do is have a layout_hdr level * dictionary of <multipath_list4, fh> keys, each @@ -864,30 +860,36 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, return 0; } - size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), gfp_flags); if (!buckets) return -ENOMEM; - else { - int i; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + } - spin_lock(cinfo->lock); - if (cinfo->ds->nbuckets != 0) - kfree(buckets); - else { - cinfo->ds->buckets = buckets; - cinfo->ds->nbuckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - } - } - spin_unlock(cinfo->lock); - return 0; + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets >= size) + goto out; + for (i = 0; i < cinfo->ds->nbuckets; i++) { + list_splice(&cinfo->ds->buckets[i].written, + &buckets[i].written); + list_splice(&cinfo->ds->buckets[i].committing, + &buckets[i].committing); + buckets[i].direct_verf.committed = + cinfo->ds->buckets[i].direct_verf.committed; + buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; + buckets[i].clseg = cinfo->ds->buckets[i].clseg; } + swap(cinfo->ds->buckets, buckets); + cinfo->ds->nbuckets = size; +out: + spin_unlock(cinfo->lock); + kfree(buckets); + return 0; } static struct pnfs_layout_segment * @@ -915,47 +917,51 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, /* * filelayout_pg_test(). Called by nfs_can_coalesce_requests() * - * return true : coalesce page - * return false : don't coalesce page + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. */ -static bool +static size_t filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { + unsigned int size; u64 p_stripe, r_stripe; - u32 stripe_unit; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; - if (!pnfs_generic_pg_test(pgio, prev, req) || - !nfs_generic_pg_test(pgio, prev, req)) - return false; + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; - p_stripe = (u64)req_offset(prev); - r_stripe = (u64)req_offset(req); - stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); - do_div(p_stripe, stripe_unit); - do_div(r_stripe, stripe_unit); + if (p_stripe != r_stripe) + return 0; + } - return (p_stripe == r_stripe); + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); } static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (req->wb_offset != req->wb_pgbase) { - /* - * Handling unaligned pages is difficult, because have to - * somehow split a req in two in certain cases in the - * pg.test code. Avoid this by just not using pnfs - * in this case. - */ - nfs_pageio_reset_read_mds(pgio); - return; - } - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, @@ -973,11 +979,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (req->wb_offset != req->wb_pgbase) - goto out_mds; - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, @@ -1067,6 +1070,7 @@ filelayout_choose_commit_list(struct nfs_page *req, */ j = nfs4_fl_calc_j_index(lseg, req_offset(req)); i = select_bucket_index(fl, j); + spin_lock(cinfo->lock); buckets = cinfo->ds->buckets; list = &buckets[i].written; if (list_empty(list)) { @@ -1080,6 +1084,7 @@ filelayout_choose_commit_list(struct nfs_page *req, } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; + spin_unlock(cinfo->lock); return list; } @@ -1176,6 +1181,7 @@ transfer_commit_list(struct list_head *src, struct list_head *dst, return ret; } +/* Note called with cinfo->lock held. */ static int filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo, @@ -1220,15 +1226,18 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *b; + struct pnfs_layout_segment *freeme; int i; +restart: spin_lock(cinfo->lock); for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { - spin_unlock(cinfo->lock); - pnfs_put_lseg(b->wlseg); + freeme = b->wlseg; b->wlseg = NULL; - spin_lock(cinfo->lock); + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + goto restart; } } cinfo->ds->nwritten = 0; @@ -1243,6 +1252,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) struct nfs_commit_data *data; int i, j; unsigned int nreq = 0; + struct pnfs_layout_segment *freeme; fl_cinfo = cinfo->ds; bucket = fl_cinfo->buckets; @@ -1253,8 +1263,10 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (!data) break; data->ds_commit_index = i; + spin_lock(cinfo->lock); data->lseg = bucket->clseg; bucket->clseg = NULL; + spin_unlock(cinfo->lock); list_add(&data->pages, list); nreq++; } @@ -1264,8 +1276,11 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) if (list_empty(&bucket->committing)) continue; nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); - pnfs_put_lseg(bucket->clseg); + spin_lock(cinfo->lock); + freeme = bucket->clseg; bucket->clseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); } /* Caller will clean up entries put on list */ return nreq; @@ -1330,7 +1345,7 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct nfs4_filelayout *flo; flo = kzalloc(sizeof(*flo), gfp_flags); - return &flo->generic_hdr; + return flo != NULL ? &flo->generic_hdr : NULL; } static void diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/filelayout/filelayout.h index cebd20e7e923..ffbddf2219ea 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -30,7 +30,7 @@ #ifndef FS_NFS_NFS4FILELAYOUT_H #define FS_NFS_NFS4FILELAYOUT_H -#include "pnfs.h" +#include "../pnfs.h" /* * Default data server connection timeout and retrans vaules. diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index efac602edb37..44bf0140a4c7 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -33,9 +33,9 @@ #include <linux/module.h> #include <linux/sunrpc/addr.h> -#include "internal.h" -#include "nfs4session.h" -#include "nfs4filelayout.h" +#include "../internal.h" +#include "../nfs4session.h" +#include "filelayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -789,9 +789,9 @@ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 66984a9aafaa..b94f80420a58 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -120,7 +120,8 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, security_d_instantiate(ret, inode); spin_lock(&ret->d_lock); - if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { + if (IS_ROOT(ret) && !ret->d_fsdata && + !(ret->d_flags & DCACHE_NFSFS_RENAMED)) { ret->d_fsdata = name; name = NULL; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0c438973f3c8..9927913c97c2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping) return ret; } +static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; + nfsi->cache_validity |= flags; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); +} + /* * Invalidate the local caches */ @@ -162,17 +173,16 @@ static void nfs_zap_caches_locked(struct inode *inode) memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_fscache_invalidate(inode); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); } else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_REVAL_PAGECACHE); nfs_zap_label_cache_locked(nfsi); } @@ -187,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } @@ -209,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_invalidate_atime); @@ -369,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -415,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else - nfsi->cache_validity |= NFS_INO_INVALID_ATTR - | NFS_INO_REVAL_PAGECACHE; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -550,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) spin_lock(&inode->i_lock); i_size_write(inode, offset); + /* Optimisation */ + if (offset == 0) + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); @@ -578,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL); spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { @@ -1085,7 +1098,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; @@ -1101,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && inode->i_version == fattr->pre_change_attr) { inode->i_version = fattr->change_attr; if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ @@ -1117,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) @@ -1128,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr ret |= NFS_INO_INVALID_ATTR; } - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); - return ret; } @@ -1189,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfsi->cache_validity |= invalid; + nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1402,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_DATA; - nfs_fscache_invalidate(inode); - } + if (S_ISDIR(inode->i_mode)) + invalid |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, invalid); if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); @@ -1575,18 +1583,20 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_version = fattr->change_attr; } } else if (server->caps & NFS_CAP_CHANGE_ATTR) - invalid |= save_cache_validity; + nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } else if (server->caps & NFS_CAP_MTIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_CTIME) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } else if (server->caps & NFS_CAP_CTIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ @@ -1599,6 +1609,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((nfsi->npages == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1608,7 +1619,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (long long)new_isize); } } else - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); @@ -1616,7 +1628,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); else if (server->caps & NFS_CAP_ATIME) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_MODE) { @@ -1627,7 +1640,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } } else if (server->caps & NFS_CAP_MODE) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1638,7 +1652,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_uid = fattr->uid; } } else if (server->caps & NFS_CAP_OWNER) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1649,7 +1664,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid = fattr->gid; } } else if (server->caps & NFS_CAP_OWNER_GROUP) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); @@ -1662,7 +1678,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); } } else if (server->caps & NFS_CAP_NLINK) - invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + nfsi->cache_validity |= save_cache_validity & + (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -1694,10 +1711,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid &= ~NFS_INO_INVALID_DATA; if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) || (save_cache_validity & NFS_INO_REVAL_FORCED)) - nfsi->cache_validity |= invalid; - - if (invalid & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); + nfs_set_cache_invalid(inode, invalid); return 0; out_err: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index dd8bfc2e2464..f415cbf9f6c3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -231,13 +231,21 @@ extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); -extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount); extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_io_counter *c); +extern const struct nfs_pageio_ops nfs_pgio_rw_ops; +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); +void nfs_rw_header_free(struct nfs_pgio_header *); +void nfs_pgio_data_release(struct nfs_pgio_data *); +int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); +int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, + const struct rpc_call_ops *, int, int); +void nfs_free_request(struct nfs_page *req); + static inline void nfs_iocounter_init(struct nfs_io_counter *c) { c->flags = 0; @@ -320,16 +328,14 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *) int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int); loff_t nfs_file_llseek(struct file *, loff_t, int); int nfs_file_flush(struct file *, fl_owner_t); -ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int nfs_file_mmap(struct file *, struct vm_area_struct *); -ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); +ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); -ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, - size_t, unsigned int); int nfs_check_flags(int); int nfs_setlease(struct file *, long, struct file_lock **); @@ -395,19 +401,11 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool struct nfs_pgio_completion_ops; /* read.c */ -extern struct nfs_read_header *nfs_readhdr_alloc(void); -extern void nfs_readhdr_free(struct nfs_pgio_header *hdr); extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, int flags); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); -extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_readdata_release(struct nfs_read_data *rdata); /* super.c */ void nfs_clone_super(struct super_block *, struct nfs_mount_info *); @@ -422,19 +420,10 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); -extern struct nfs_write_header *nfs_writehdr_alloc(void); -extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); -extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); -extern void nfs_writedata_release(struct nfs_write_data *wdata); extern void nfs_commit_free(struct nfs_commit_data *p); -extern int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, @@ -447,6 +436,7 @@ extern void nfs_init_commit(struct nfs_commit_data *data, struct nfs_commit_info *cinfo); int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max); +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, @@ -492,7 +482,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_read_data *); +extern void __nfs4_read_done_cb(struct nfs_pgio_data *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 62db136339ea..5f61b83f4a1c 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -103,7 +103,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) /* * typedef opaque nfsdata<>; */ -static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result) +static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) { u32 recvd, count; __be32 *p; @@ -613,7 +613,7 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, * }; */ static void encode_readargs(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -629,7 +629,7 @@ static void encode_readargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_readargs(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -649,7 +649,7 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, * }; */ static void encode_writeargs(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { u32 offset = args->offset; u32 count = args->count; @@ -669,7 +669,7 @@ static void encode_writeargs(struct xdr_stream *xdr, static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_writeargs(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; @@ -857,7 +857,7 @@ out_default: * }; */ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -878,7 +878,7 @@ out_default: } static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 871d6eda8dba..8f854dde4150 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -247,3 +247,46 @@ const struct xattr_handler *nfs3_xattr_handlers[] = { &posix_acl_default_xattr_handler, NULL, }; + +static int +nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, + size_t size, ssize_t *result) +{ + struct posix_acl *acl; + char *p = data + *result; + + acl = get_acl(inode, type); + if (!acl) + return 0; + + posix_acl_release(acl); + + *result += strlen(name); + *result += 1; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +nfs3_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + ssize_t result = 0; + int error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, + POSIX_ACL_XATTR_ACCESS, data, size, &result); + if (error) + return error; + + error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, + POSIX_ACL_XATTR_DEFAULT, data, size, &result); + if (error) + return error; + return result; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index db60149c4579..f0afa291fd58 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -795,7 +795,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -807,18 +807,18 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; } -static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; } -static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -829,17 +829,11 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; } -static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { rpc_call_start(task); @@ -891,7 +885,7 @@ static const struct inode_operations nfs3_dir_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, @@ -905,7 +899,7 @@ static const struct inode_operations nfs3_file_inode_operations = { .getattr = nfs_getattr, .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL - .listxattr = generic_listxattr, + .listxattr = nfs3_listxattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, .removexattr = generic_removexattr, @@ -946,13 +940,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .fsinfo = nfs3_proc_fsinfo, .pathconf = nfs3_proc_pathconf, .decode_dirent = nfs3_decode_dirent, + .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare, .read_setup = nfs3_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, - .read_rpc_prepare = nfs3_proc_read_rpc_prepare, .read_done = nfs3_read_done, .write_setup = nfs3_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, - .write_rpc_prepare = nfs3_proc_write_rpc_prepare, .write_done = nfs3_write_done, .commit_setup = nfs3_proc_commit_setup, .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index fa6d72131c19..8f4cbe7f4aa8 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -953,7 +953,7 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, * }; */ static void encode_read3args(struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -966,7 +966,7 @@ static void encode_read3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_readargs *args) + const struct nfs_pgio_args *args) { encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, @@ -992,7 +992,7 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, * }; */ static void encode_write3args(struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { __be32 *p; @@ -1008,7 +1008,7 @@ static void encode_write3args(struct xdr_stream *xdr, static void nfs3_xdr_enc_write3args(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct nfs_writeargs *args) + const struct nfs_pgio_args *args) { encode_write3args(xdr, args); xdr->buf->flags |= XDRBUF_WRITE; @@ -1589,7 +1589,7 @@ out_default: * }; */ static int decode_read3resok(struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { u32 eof, count, ocount, recvd; __be32 *p; @@ -1625,7 +1625,7 @@ out_overflow: } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; @@ -1673,7 +1673,7 @@ out_status: * }; */ static int decode_write3resok(struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { __be32 *p; @@ -1697,7 +1697,7 @@ out_eio: } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeres *result) + struct nfs_pgio_res *result) { enum nfs_stat status; int error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index e1d1badbe53c..ba2affa51941 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -230,7 +230,7 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *, extern struct file_system_type nfs4_fs_type; /* nfs4namespace.c */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *); +struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *); struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); int nfs4_replace_transport(struct nfs_server *server, @@ -337,7 +337,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, */ static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) @@ -369,7 +369,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags, static inline void nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, - struct rpc_message *msg, struct nfs_write_data *wdata) + struct rpc_message *msg, struct nfs_pgio_data *wdata) { } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8de3407e0360..a816f0627a6c 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -100,8 +100,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) break; mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); - if (!ret && !datasync) - /* application has asked for meta-data sync */ + if (!ret) ret = pnfs_layoutcommit_inode(inode, true); mutex_unlock(&inode->i_mutex); /* @@ -118,10 +117,10 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct file_operations nfs4_file_operations = { .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = nfs_file_read, + .write_iter = nfs_file_write, .mmap = nfs_file_mmap, .open = nfs4_file_open, .flush = nfs_file_flush, @@ -130,7 +129,7 @@ const struct file_operations nfs4_file_operations = { .lock = nfs_lock, .flock = nfs_flock, .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, + .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = nfs_setlease, }; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3d5dbf80d46a..3d83cb1fdc70 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -139,16 +139,22 @@ static size_t nfs_parse_server_name(char *string, size_t len, * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * - * Return the pseudoflavor of the first security mechanism in - * "flavors" that is locally supported. Return RPC_AUTH_UNIX if - * no matching flavor is found in the array. The "flavors" array + * Return an rpc client that uses the first security mechanism in + * "flavors" that is locally supported. The "flavors" array * is searched in the order returned from the server, per RFC 3530 - * recommendation. + * recommendation and each flavor is checked for membership in the + * sec= mount option list if it exists. + * + * Return -EPERM if no matching flavor is found in the array. + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + * */ -static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, +static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt, + struct nfs_server *server, struct nfs4_secinfo_flavors *flavors) { - rpc_authflavor_t pseudoflavor; + rpc_authflavor_t pflavor; struct nfs4_secinfo4 *secinfo; unsigned int i; @@ -159,62 +165,73 @@ static rpc_authflavor_t nfs_find_best_sec(struct nfs_server *server, case RPC_AUTH_NULL: case RPC_AUTH_UNIX: case RPC_AUTH_GSS: - pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + pflavor = rpcauth_get_pseudoflavor(secinfo->flavor, &secinfo->flavor_info); - /* make sure pseudoflavor matches sec= mount opt */ - if (pseudoflavor != RPC_AUTH_MAXFLAVOR && - nfs_auth_info_match(&server->auth_info, - pseudoflavor)) - return pseudoflavor; - break; + /* does the pseudoflavor match a sec= mount opt? */ + if (pflavor != RPC_AUTH_MAXFLAVOR && + nfs_auth_info_match(&server->auth_info, pflavor)) { + struct rpc_clnt *new; + struct rpc_cred *cred; + + /* Cloning creates an rpc_auth for the flavor */ + new = rpc_clone_client_set_auth(clnt, pflavor); + if (IS_ERR(new)) + continue; + /** + * Check that the user actually can use the + * flavor. This is mostly for RPC_AUTH_GSS + * where cr_init obtains a gss context + */ + cred = rpcauth_lookupcred(new->cl_auth, 0); + if (IS_ERR(cred)) { + rpc_shutdown_client(new); + continue; + } + put_rpccred(cred); + return new; + } } } - - /* if there were any sec= options then nothing matched */ - if (server->auth_info.flavor_len > 0) - return -EPERM; - - return RPC_AUTH_UNIX; + return ERR_PTR(-EPERM); } -static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) +/** + * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup, + * return an rpc_clnt that uses the best available security flavor with + * respect to the secinfo flavor list and the sec= mount options. + * + * @clnt: RPC client to clone + * @inode: directory inode + * @name: lookup name + * + * Please call rpc_shutdown_client() when you are done with this rpc client. + */ +struct rpc_clnt * +nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode, + struct qstr *name) { struct page *page; struct nfs4_secinfo_flavors *flavors; - rpc_authflavor_t flavor; + struct rpc_clnt *new; int err; page = alloc_page(GFP_KERNEL); if (!page) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + flavors = page_address(page); err = nfs4_proc_secinfo(inode, name, flavors); if (err < 0) { - flavor = err; + new = ERR_PTR(err); goto out; } - flavor = nfs_find_best_sec(NFS_SERVER(inode), flavors); + new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors); out: put_page(page); - return flavor; -} - -/* - * Please call rpc_shutdown_client() when you are done with this client. - */ -struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode, - struct qstr *name) -{ - rpc_authflavor_t flavor; - - flavor = nfs4_negotiate_security(inode, name); - if ((int)flavor < 0) - return ERR_PTR((int)flavor); - - return rpc_clone_client_set_auth(clnt, flavor); + return new; } static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, @@ -397,11 +414,6 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, if (client->cl_auth->au_flavor != flavor) flavor = client->cl_auth->au_flavor; - else { - rpc_authflavor_t new = nfs4_negotiate_security(dir, name); - if ((int)new >= 0) - flavor = new; - } mnt = nfs_do_submount(dentry, fh, fattr, flavor); out: rpc_shutdown_client(client); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 397be39c6dc8..4bf3d97cc5a0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2027,7 +2027,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); return 0; } @@ -2750,7 +2750,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3247,7 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, err = -EPERM; if (client != *clnt) goto out; - client = nfs4_create_sec_client(client, dir, name); + client = nfs4_negotiate_security(client, dir, name); if (IS_ERR(client)) return PTR_ERR(client); @@ -4033,12 +4033,12 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_read_data *data) +void __nfs4_read_done_cb(struct nfs_pgio_data *data) { nfs_invalidate_atime(data->header->inode); } -static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4055,7 +4055,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) } static bool nfs4_read_stateid_changed(struct rpc_task *task, - struct nfs_readargs *args) + struct nfs_pgio_args *args) { if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4068,7 +4068,7 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { dprintk("--> %s\n", __func__); @@ -4077,19 +4077,19 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return -EAGAIN; if (nfs4_read_stateid_changed(task, &data->args)) return -EAGAIN; - return data->read_done_cb ? data->read_done_cb(task, data) : + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : nfs4_read_done_cb(task, data); } -static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { data->timestamp = jiffies; - data->read_done_cb = nfs4_read_done_cb; + data->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); } -static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, @@ -4097,14 +4097,14 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat task)) return 0; if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_READ) == -EIO) + data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) return -EIO; return 0; } -static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -4121,7 +4121,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data } static bool nfs4_write_stateid_changed(struct rpc_task *task, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { if (!nfs4_error_stateid_expired(task->tk_status) || @@ -4134,18 +4134,18 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task, return true; } -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { if (!nfs4_sequence_done(task, &data->res.seq_res)) return -EAGAIN; if (nfs4_write_stateid_changed(task, &data->args)) return -EAGAIN; - return data->write_done_cb ? data->write_done_cb(task, data) : + return data->pgio_done_cb ? data->pgio_done_cb(task, data) : nfs4_write_done_cb(task, data); } static -bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) +bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) { const struct nfs_pgio_header *hdr = data->header; @@ -4158,7 +4158,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->header->inode); @@ -4168,8 +4168,8 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag } else data->args.bitmask = server->cache_consistency_bitmask; - if (!data->write_done_cb) - data->write_done_cb = nfs4_write_done_cb; + if (!data->pgio_done_cb) + data->pgio_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; @@ -4177,21 +4177,6 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); } -static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), - &data->args.seq_args, - &data->res.seq_res, - task)) - return 0; - if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, - data->args.lock_context, FMODE_WRITE) == -EIO) - return -EIO; - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) - return -EIO; - return 0; -} - static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { nfs4_setup_sequence(NFS_SERVER(data->inode), @@ -8432,13 +8417,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .pathconf = nfs4_proc_pathconf, .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, + .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare, .read_setup = nfs4_proc_read_setup, - .read_pageio_init = pnfs_pageio_init_read, - .read_rpc_prepare = nfs4_proc_read_rpc_prepare, .read_done = nfs4_read_done, .write_setup = nfs4_proc_write_setup, - .write_pageio_init = pnfs_pageio_init_write, - .write_rpc_prepare = nfs4_proc_write_rpc_prepare, .write_done = nfs4_write_done, .commit_setup = nfs4_proc_commit_setup, .commit_rpc_prepare = nfs4_proc_commit_rpc_prepare, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2349518eef2c..848f6853c59e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1140,9 +1140,9 @@ static int nfs4_run_state_manager(void *); static void nfs4_clear_state_manager_bit(struct nfs_client *clp) { - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING); rpc_wake_up(&clp->cl_rpcwaitq); } @@ -1456,7 +1456,7 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * server that doesn't support a grace period. */ spin_lock(&sp->so_lock); - write_seqcount_begin(&sp->so_reclaim_seqcount); + raw_write_seqcount_begin(&sp->so_reclaim_seqcount); restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) @@ -1519,13 +1519,13 @@ restart: spin_lock(&sp->so_lock); goto restart; } - write_seqcount_end(&sp->so_reclaim_seqcount); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return 0; out_err: nfs4_put_open_state(state); spin_lock(&sp->so_lock); - write_seqcount_end(&sp->so_reclaim_seqcount); + raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return status; } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 2628d921b7e3..b6ebe7e445f6 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -16,7 +16,7 @@ static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; -static ctl_table nfs4_cb_sysctls[] = { +static struct ctl_table nfs4_cb_sysctls[] = { { .procname = "nfs_callback_tcpport", .data = &nfs_callback_set_tcpport, @@ -36,7 +36,7 @@ static ctl_table nfs4_cb_sysctls[] = { { } }; -static ctl_table nfs4_cb_sysctl_dir[] = { +static struct ctl_table nfs4_cb_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -45,7 +45,7 @@ static ctl_table nfs4_cb_sysctl_dir[] = { { } }; -static ctl_table nfs4_cb_sysctl_root[] = { +static struct ctl_table nfs4_cb_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 849cf146db30..0a744f3a86f6 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -932,7 +932,7 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group); DECLARE_EVENT_CLASS(nfs4_read_event, TP_PROTO( - const struct nfs_read_data *data, + const struct nfs_pgio_data *data, int error ), @@ -972,7 +972,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event, #define DEFINE_NFS4_READ_EVENT(name) \ DEFINE_EVENT(nfs4_read_event, name, \ TP_PROTO( \ - const struct nfs_read_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) @@ -983,7 +983,7 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); DECLARE_EVENT_CLASS(nfs4_write_event, TP_PROTO( - const struct nfs_write_data *data, + const struct nfs_pgio_data *data, int error ), @@ -1024,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event, #define DEFINE_NFS4_WRITE_EVENT(name) \ DEFINE_EVENT(nfs4_write_event, name, \ TP_PROTO( \ - const struct nfs_write_data *data, \ + const struct nfs_pgio_data *data, \ int error \ ), \ TP_ARGS(data, error)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 73ce8d4fe2c8..939ae606cfa4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1556,7 +1556,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) +static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -1701,7 +1702,8 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4 encode_nfs4_verifier(xdr, &arg->confirm); } -static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr) +static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args, + struct compound_hdr *hdr) { __be32 *p; @@ -2451,7 +2453,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a READ request */ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_readargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -2513,7 +2515,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, * Encode a WRITE request */ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_writeargs *args) + struct nfs_pgio_args *args) { struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), @@ -5085,7 +5087,8 @@ static int decode_putrootfh(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_PUTROOTFH); } -static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res) +static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, + struct nfs_pgio_res *res) { __be32 *p; uint32_t count, eof, recvd; @@ -5339,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM); } -static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) +static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) { __be32 *p; int status; @@ -6636,7 +6639,7 @@ out: * Decode Read response */ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_readres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; @@ -6661,7 +6664,7 @@ out: * Decode WRITE response */ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct nfs_writeres *res) + struct nfs_pgio_res *res) { struct compound_hdr hdr; int status; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5457745dd4f1..611320753db2 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -439,7 +439,7 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -int objio_read_pagelist(struct nfs_read_data *rdata) +int objio_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct objio_state *objios; @@ -487,7 +487,7 @@ static void _write_done(struct ore_io_state *ios, void *private) static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { struct objio_state *objios = priv; - struct nfs_write_data *wdata = objios->oir.rpcdata; + struct nfs_pgio_data *wdata = objios->oir.rpcdata; struct address_space *mapping = wdata->header->inode->i_mapping; pgoff_t index = offset / PAGE_SIZE; struct page *page; @@ -531,7 +531,7 @@ static const struct _ore_r4w_op _r4w_op = { .put_page = &__r4w_put_page, }; -int objio_write_pagelist(struct nfs_write_data *wdata, int how) +int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; struct objio_state *objios; @@ -564,14 +564,22 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) return 0; } -static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (!pnfs_generic_pg_test(pgio, prev, req)) - return false; + unsigned int size; + + size = pnfs_generic_pg_test(pgio, prev, req); + + if (!size || pgio->pg_count + req->wb_bytes > + (unsigned long)pgio->pg_layout_private) + return 0; - return pgio->pg_count + req->wb_bytes <= - (unsigned long)pgio->pg_layout_private; + return min(size, req->wb_bytes); } static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index e4f9cbfec67b..765d3f54e986 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -53,10 +53,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct objlayout *objlay; objlay = kzalloc(sizeof(struct objlayout), gfp_flags); - if (objlay) { - spin_lock_init(&objlay->lock); - INIT_LIST_HEAD(&objlay->err_list); - } + if (!objlay) + return NULL; + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); dprintk("%s: Return %p\n", __func__, objlay); return &objlay->pnfs_layout; } @@ -229,11 +229,11 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, static void _rpc_read_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_read_data *rdata; + struct nfs_pgio_data *rdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - rdata = container_of(task, struct nfs_read_data, task); + rdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_read_done(rdata); } @@ -241,7 +241,7 @@ static void _rpc_read_complete(struct work_struct *work) void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_read_data *rdata = oir->rpcdata; + struct nfs_pgio_data *rdata = oir->rpcdata; oir->status = rdata->task.tk_status = status; if (status >= 0) @@ -266,7 +266,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async reads. */ enum pnfs_try_status -objlayout_read_pagelist(struct nfs_read_data *rdata) +objlayout_read_pagelist(struct nfs_pgio_data *rdata) { struct nfs_pgio_header *hdr = rdata->header; struct inode *inode = hdr->inode; @@ -312,11 +312,11 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) static void _rpc_write_complete(struct work_struct *work) { struct rpc_task *task; - struct nfs_write_data *wdata; + struct nfs_pgio_data *wdata; dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); - wdata = container_of(task, struct nfs_write_data, task); + wdata = container_of(task, struct nfs_pgio_data, task); pnfs_ld_write_done(wdata); } @@ -324,7 +324,7 @@ static void _rpc_write_complete(struct work_struct *work) void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata = oir->rpcdata; + struct nfs_pgio_data *wdata = oir->rpcdata; oir->status = wdata->task.tk_status = status; if (status >= 0) { @@ -351,7 +351,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) * Perform sync or async writes. */ enum pnfs_try_status -objlayout_write_pagelist(struct nfs_write_data *wdata, +objlayout_write_pagelist(struct nfs_pgio_data *wdata, int how) { struct nfs_pgio_header *hdr = wdata->header; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 87aa1dec6120..01e041029a6c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg); */ extern void objio_free_result(struct objlayout_io_res *oir); -extern int objio_read_pagelist(struct nfs_read_data *rdata); -extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); +extern int objio_read_pagelist(struct nfs_pgio_data *rdata); +extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); /* * callback API @@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg( extern void objlayout_free_lseg(struct pnfs_layout_segment *); extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_read_data *); + struct nfs_pgio_data *); extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_write_data *, + struct nfs_pgio_data *, int how); extern void objlayout_encode_layoutcommit( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 2ffebf2081ce..17fab89f6358 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -24,9 +24,12 @@ #include "internal.h" #include "pnfs.h" +#define NFSDBG_FACILITY NFSDBG_PAGECACHE + static struct kmem_cache *nfs_page_cachep; +static const struct rpc_call_ops nfs_pgio_common_ops; -bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) +static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) { p->npages = pagecount; if (pagecount <= ARRAY_SIZE(p->page_array)) @@ -95,7 +98,7 @@ nfs_iocounter_dec(struct nfs_io_counter *c) { if (atomic_dec_and_test(&c->io_count)) { clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&c->flags, NFS_IO_INPROGRESS); } } @@ -133,11 +136,168 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req - request in group that is to be locked + * + * this lock must be held if modifying the page group list + */ +void +nfs_page_group_lock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req - request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + smp_mb__before_atomic(); + clear_bit(PG_HEADLOCK, &head->wb_flags); + smp_mb__after_atomic(); + wake_up_bit(&head->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_sync_on_bit_locked + * + * must be called with page group lock held + */ +static bool +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) +{ + struct nfs_page *head = req->wb_head; + struct nfs_page *tmp; + + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); + + tmp = req->wb_this_page; + while (tmp != req) { + if (!test_bit(bit, &tmp->wb_flags)) + return false; + tmp = tmp->wb_this_page; + } + + /* true! reset all bits */ + tmp = req; + do { + clear_bit(bit, &tmp->wb_flags); + tmp = tmp->wb_this_page; + } while (tmp != req); + + return true; +} + +/* + * nfs_page_group_sync_on_bit - set bit on current request, but only + * return true if the bit is set for all requests in page group + * @req - request in page group + * @bit - PG_* bit that is used to sync page group + */ +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) +{ + bool ret; + + nfs_page_group_lock(req); + ret = nfs_page_group_sync_on_bit_locked(req, bit); + nfs_page_group_unlock(req); + + return ret; +} + +/* + * nfs_page_group_init - Initialize the page group linkage for @req + * @req - a new nfs request + * @prev - the previous request in page group, or NULL if @req is the first + * or only request in the group (the head). + */ +static inline void +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) +{ + WARN_ON_ONCE(prev == req); + + if (!prev) { + /* a head request */ + req->wb_head = req; + req->wb_this_page = req; + } else { + /* a subrequest */ + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); + req->wb_head = prev->wb_head; + req->wb_this_page = prev->wb_this_page; + prev->wb_this_page = req; + + /* All subrequests take a ref on the head request until + * nfs_page_group_destroy is called */ + kref_get(&req->wb_head->wb_kref); + + /* grab extra ref if head request has extra ref from + * the write/commit path to handle handoff between write + * and commit lists */ + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + set_bit(PG_INODE_REF, &req->wb_flags); + kref_get(&req->wb_kref); + } + } +} + +/* + * nfs_page_group_destroy - sync the destruction of page groups + * @req - request that no longer needs the page group + * + * releases the page group reference from each member once all + * members have called this function. + */ +static void +nfs_page_group_destroy(struct kref *kref) +{ + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *tmp, *next; + + /* subrequests must release the ref on the head request */ + if (req->wb_head != req) + nfs_release_request(req->wb_head); + + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) + return; + + tmp = req; + do { + next = tmp->wb_this_page; + /* unlink and free */ + tmp->wb_this_page = tmp; + tmp->wb_head = tmp; + nfs_free_request(tmp); + tmp = next; + } while (tmp != req); +} + /** * nfs_create_request - Create an NFS read/write request. * @ctx: open context to use - * @inode: inode to which the request is attached * @page: page to write + * @last: last nfs request created for this page group or NULL if head * @offset: starting offset within the page for the write * @count: number of bytes to read/write * @@ -146,9 +306,9 @@ nfs_iocounter_wait(struct nfs_io_counter *c) * User should ensure it is safe to sleep in this function. */ struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, - struct page *page, - unsigned int offset, unsigned int count) +nfs_create_request(struct nfs_open_context *ctx, struct page *page, + struct nfs_page *last, unsigned int offset, + unsigned int count) { struct nfs_page *req; struct nfs_lock_context *l_ctx; @@ -180,6 +340,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); kref_init(&req->wb_kref); + nfs_page_group_init(req, last); return req; } @@ -193,9 +354,9 @@ void nfs_unlock_request(struct nfs_page *req) printk(KERN_ERR "NFS: Invalid unlock attempted\n"); BUG(); } - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(PG_BUSY, &req->wb_flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&req->wb_flags, PG_BUSY); } @@ -237,16 +398,22 @@ static void nfs_clear_request(struct nfs_page *req) } } - /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! */ -static void nfs_free_request(struct kref *kref) +void nfs_free_request(struct nfs_page *req) { - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + WARN_ON_ONCE(req->wb_this_page != req); + + /* extra debug: make sure no sync bits are still set */ + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags)); + WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags)); /* Release struct file and open context */ nfs_clear_request(req); @@ -255,13 +422,7 @@ static void nfs_free_request(struct kref *kref) void nfs_release_request(struct nfs_page *req) { - kref_put(&req->wb_kref, nfs_free_request); -} - -static int nfs_wait_bit_uninterruptible(void *word) -{ - io_schedule(); - return 0; + kref_put(&req->wb_kref, nfs_page_group_destroy); } /** @@ -279,22 +440,249 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } -bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +/* + * nfs_generic_pg_test - determine if requests can be coalesced + * @desc: pointer to descriptor + * @prev: previous request in desc, or NULL + * @req: this request + * + * Returns zero if @req can be coalesced into @desc, otherwise it returns + * the size of the request. + */ +size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, + struct nfs_page *prev, struct nfs_page *req) { - /* - * FIXME: ideally we should be able to coalesce all requests - * that are not block boundary aligned, but currently this - * is problematic for the case of bsize < PAGE_CACHE_SIZE, - * since nfs_flush_multi and nfs_pagein_multi assume you - * can have only one struct nfs_page. - */ - if (desc->pg_bsize < PAGE_SIZE) + if (desc->pg_count > desc->pg_bsize) { + /* should never happen */ + WARN_ON_ONCE(1); return 0; + } - return desc->pg_count + req->wb_bytes <= desc->pg_bsize; + return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); +static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) +{ + return container_of(hdr, struct nfs_rw_header, header); +} + +/** + * nfs_rw_header_alloc - Allocate a header for a read or write + * @ops: Read or write function vector + */ +struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops) +{ + struct nfs_rw_header *header = ops->rw_alloc_header(); + + if (header) { + struct nfs_pgio_header *hdr = &header->header; + + INIT_LIST_HEAD(&hdr->pages); + spin_lock_init(&hdr->lock); + atomic_set(&hdr->refcnt, 0); + hdr->rw_ops = ops; + } + return header; +} +EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); + +/* + * nfs_rw_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_rw_header_free(struct nfs_pgio_header *hdr) +{ + hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); +} +EXPORT_SYMBOL_GPL(nfs_rw_header_free); + +/** + * nfs_pgio_data_alloc - Allocate pageio data + * @hdr: The header making a request + * @pagecount: Number of pages to create + */ +static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, + unsigned int pagecount) +{ + struct nfs_pgio_data *data, *prealloc; + + prealloc = &NFS_RW_HEADER(hdr)->rpc_data; + if (prealloc->header == NULL) + data = prealloc; + else + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + if (nfs_pgarray_set(&data->pages, pagecount)) { + data->header = hdr; + atomic_inc(&hdr->refcnt); + } else { + if (data != prealloc) + kfree(data); + data = NULL; + } +out: + return data; +} + +/** + * nfs_pgio_data_release - Properly free pageio data + * @data: The data to release + */ +void nfs_pgio_data_release(struct nfs_pgio_data *data) +{ + struct nfs_pgio_header *hdr = data->header; + struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); + + put_nfs_open_context(data->args.context); + if (data->pages.pagevec != data->pages.page_array) + kfree(data->pages.pagevec); + if (data == &pageio_header->rpc_data) { + data->header = NULL; + data = NULL; + } + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(data); +} +EXPORT_SYMBOL_GPL(nfs_pgio_data_release); + +/** + * nfs_pgio_rpcsetup - Set up arguments for a pageio call + * @data: The pageio data + * @count: Number of bytes to read + * @offset: Initial offset + * @how: How to commit data (writes only) + * @cinfo: Commit information for the call (writes only) + */ +static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, + unsigned int count, unsigned int offset, + int how, struct nfs_commit_info *cinfo) +{ + struct nfs_page *req = data->header->req; + + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->args.fh = NFS_FH(data->header->inode); + data->args.offset = req_offset(req) + offset; + /* pnfs_set_layoutcommit needs this */ + data->mds_offset = data->args.offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pages.pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { + case 0: + break; + case FLUSH_COND_STABLE: + if (nfs_reqs_to_commit(cinfo)) + break; + default: + data->args.stable = NFS_FILE_SYNC; + } + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); +} + +/** + * nfs_pgio_prepare - Prepare pageio data to go over the wire + * @task: The current task + * @calldata: pageio data to prepare + */ +static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + int err; + err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); + if (err) + rpc_exit(task, err); +} + +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, + const struct rpc_call_ops *call_ops, int how, int flags) +{ + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->header->cred, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | flags, + }; + int ret = 0; + + data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); + + dprintk("NFS: %5u initiated pgio call " + "(req %s/%llu, %u bytes @ offset %llu)\n", + data->task.tk_pid, + data->header->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(data->header->inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL_GPL(nfs_initiate_pgio); + +/** + * nfs_pgio_error - Clean up from a pageio error + * @desc: IO descriptor + * @hdr: pageio header + */ +static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + set_bit(NFS_IOHDR_REDO, &hdr->flags); + nfs_pgio_data_release(hdr->data); + hdr->data = NULL; + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; +} + +/** + * nfs_pgio_release - Release pageio data + * @calldata: The pageio data to release + */ +static void nfs_pgio_release(void *calldata) +{ + struct nfs_pgio_data *data = calldata; + if (data->header->rw_ops->rw_release) + data->header->rw_ops->rw_release(data); + nfs_pgio_data_release(data); +} + /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor @@ -307,6 +695,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, const struct nfs_pageio_ops *pg_ops, const struct nfs_pgio_completion_ops *compl_ops, + const struct nfs_rw_ops *rw_ops, size_t bsize, int io_flags) { @@ -320,6 +709,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_completion_ops = compl_ops; + desc->pg_rw_ops = rw_ops; desc->pg_ioflags = io_flags; desc->pg_error = 0; desc->pg_lseg = NULL; @@ -328,6 +718,94 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, } EXPORT_SYMBOL_GPL(nfs_pageio_init); +/** + * nfs_pgio_result - Basic pageio error handling + * @task: The task that ran + * @calldata: Pageio data to check + */ +static void nfs_pgio_result(struct rpc_task *task, void *calldata) +{ + struct nfs_pgio_data *data = calldata; + struct inode *inode = data->header->inode; + + dprintk("NFS: %s: %5u, (status %d)\n", __func__, + task->tk_pid, task->tk_status); + + if (data->header->rw_ops->rw_done(task, data, inode) != 0) + return; + if (task->tk_status < 0) + nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); + else + data->header->rw_ops->rw_result(task, data); +} + +/* + * Create an RPC task for the given read or write request and kick it. + * The page must have been locked by the caller. + * + * It may happen that the page we're passed is not marked dirty. + * This is the case if nfs_updatepage detects a conflicting request + * that has been written but not committed. + */ +int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + struct nfs_page *req; + struct page **pages; + struct nfs_pgio_data *data; + struct list_head *head = &desc->pg_list; + struct nfs_commit_info cinfo; + + data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, + desc->pg_count)); + if (!data) + return nfs_pgio_error(desc, hdr); + + nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); + pages = data->pages.pagevec; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_list_add_request(req, &hdr->pages); + *pages++ = req->wb_page; + } + + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + /* Set up the argument struct */ + nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); + hdr->data = data; + desc->pg_rpc_callops = &nfs_pgio_common_ops; + return 0; +} +EXPORT_SYMBOL_GPL(nfs_generic_pgio); + +static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) +{ + struct nfs_rw_header *rw_hdr; + struct nfs_pgio_header *hdr; + int ret; + + rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); + if (!rw_hdr) { + desc->pg_completion_ops->error_cleanup(&desc->pg_list); + return -ENOMEM; + } + hdr = &rw_hdr->header; + nfs_pgheader_init(desc, hdr, nfs_rw_header_free); + atomic_inc(&hdr->refcnt); + ret = nfs_generic_pgio(desc, hdr); + if (ret == 0) + ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), + hdr->data, desc->pg_rpc_callops, + desc->pg_ioflags, 0); + if (atomic_dec_and_test(&hdr->refcnt)) + hdr->completion_ops->completion(hdr); + return ret; +} + static bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { @@ -356,18 +834,23 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { - if (!nfs_match_open_context(req->wb_context, prev->wb_context)) - return false; - if (req->wb_context->dentry->d_inode->i_flock != NULL && - !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) - return false; - if (req->wb_pgbase != 0) - return false; - if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) - return false; - if (req_offset(req) != req_offset(prev) + prev->wb_bytes) - return false; - return pgio->pg_ops->pg_test(pgio, prev, req); + size_t size; + + if (prev) { + if (!nfs_match_open_context(req->wb_context, prev->wb_context)) + return false; + if (req->wb_context->dentry->d_inode->i_flock != NULL && + !nfs_match_lock_context(req->wb_lock_context, + prev->wb_lock_context)) + return false; + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + } + size = pgio->pg_ops->pg_test(pgio, prev, req); + WARN_ON_ONCE(size > req->wb_bytes); + if (size && size < req->wb_bytes) + req->wb_bytes = size; + return size > 0; } /** @@ -381,17 +864,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { + struct nfs_page *prev = NULL; if (desc->pg_count != 0) { - struct nfs_page *prev; - prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req, desc)) - return 0; } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); desc->pg_base = req->wb_pgbase; } + if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; nfs_list_remove_request(req); nfs_list_add_request(req, &desc->pg_list); desc->pg_count += req->wb_bytes; @@ -421,22 +903,72 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) * @desc: destination io descriptor * @req: request * + * This may split a request into subrequests which are all part of the + * same page group. + * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. */ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { - while (!nfs_pageio_do_add_request(desc, req)) { - desc->pg_moreio = 1; - nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - desc->pg_moreio = 0; - if (desc->pg_recoalesce) - return 0; - } + struct nfs_page *subreq; + unsigned int bytes_left = 0; + unsigned int offset, pgbase; + + nfs_page_group_lock(req); + + subreq = req; + bytes_left = subreq->wb_bytes; + offset = subreq->wb_offset; + pgbase = subreq->wb_pgbase; + + do { + if (!nfs_pageio_do_add_request(desc, subreq)) { + /* make sure pg_test call(s) did nothing */ + WARN_ON_ONCE(subreq->wb_bytes != bytes_left); + WARN_ON_ONCE(subreq->wb_offset != offset); + WARN_ON_ONCE(subreq->wb_pgbase != pgbase); + + nfs_page_group_unlock(req); + desc->pg_moreio = 1; + nfs_pageio_doio(desc); + if (desc->pg_error < 0) + return 0; + if (desc->pg_recoalesce) + return 0; + /* retry add_request for this subreq */ + nfs_page_group_lock(req); + continue; + } + + /* check for buggy pg_test call(s) */ + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); + WARN_ON_ONCE(subreq->wb_bytes > bytes_left); + WARN_ON_ONCE(subreq->wb_bytes == 0); + + bytes_left -= subreq->wb_bytes; + offset += subreq->wb_bytes; + pgbase += subreq->wb_bytes; + + if (bytes_left) { + subreq = nfs_create_request(req->wb_context, + req->wb_page, + subreq, pgbase, bytes_left); + if (IS_ERR(subreq)) + goto err_ptr; + nfs_lock_request(subreq); + subreq->wb_offset = offset; + subreq->wb_index = req->wb_index; + } + } while (bytes_left > 0); + + nfs_page_group_unlock(req); return 1; +err_ptr: + desc->pg_error = PTR_ERR(subreq); + nfs_page_group_unlock(req); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -449,6 +981,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) desc->pg_count = 0; desc->pg_base = 0; desc->pg_recoalesce = 0; + desc->pg_moreio = 0; while (!list_empty(&head)) { struct nfs_page *req; @@ -535,3 +1068,13 @@ void nfs_destroy_nfspagecache(void) kmem_cache_destroy(nfs_page_cachep); } +static const struct rpc_call_ops nfs_pgio_common_ops = { + .rpc_call_prepare = nfs_pgio_prepare, + .rpc_call_done = nfs_pgio_result, + .rpc_release = nfs_pgio_release, +}; + +const struct nfs_pageio_ops nfs_pgio_rw_ops = { + .pg_test = nfs_generic_pg_test, + .pg_doio = nfs_generic_pg_pgios, +}; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index cb53d450ae32..6fdcd233d6f7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1388,11 +1388,6 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r WARN_ON_ONCE(pgio->pg_lseg != NULL); - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_read_mds(pgio); - return; - } - if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else @@ -1417,11 +1412,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, { WARN_ON_ONCE(pgio->pg_lseg != NULL); - if (req->wb_offset != req->wb_pgbase) { - nfs_pageio_reset_write_mds(pgio); - return; - } - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), @@ -1434,56 +1424,49 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); -void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_read(pgio, inode, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0); -} - -void -pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, - int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - - if (ld == NULL) - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); - else - nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags); -} - -bool +/* + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_lseg == NULL) - return nfs_generic_pg_test(pgio, prev, req); + unsigned int size; + u64 seg_end, req_start, seg_left; + + size = nfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; /* - * Test if a nfs_page is fully contained in the pnfs_layout_range. - * Note that this test makes several assumptions: - * - that the previous nfs_page in the struct nfs_pageio_descriptor - * is known to lie within the range. - * - that the nfs_page being tested is known to be contiguous with the - * previous nfs_page. - * - Layout ranges are page aligned, so we only have to test the - * start offset of the request. + * 'size' contains the number of bytes left in the current page (up + * to the original size asked for in @req->wb_bytes). + * + * Calculate how many bytes are left in the layout segment + * and if there are less bytes than 'size', return that instead. * * Please also note that 'end_offset' is actually the offset of the * first byte that lies outside the pnfs_layout_range. FIXME? * */ - return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length); + if (pgio->pg_lseg) { + seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); + req_start = req_offset(req); + WARN_ON_ONCE(req_start > seg_end); + /* start of request is past the last byte of this segment */ + if (req_start >= seg_end) + return 0; + + /* adjust 'size' iff there are fewer bytes left in the + * segment than what nfs_generic_pg_test returned */ + seg_left = seg_end - req_start; + if (seg_left < size) + size = (unsigned int)seg_left; + } + + return size; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); @@ -1496,7 +1479,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops); + nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1519,7 +1502,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); -static void pnfs_ld_handle_write_error(struct nfs_write_data *data) +static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1538,7 +1521,7 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_write_done(struct nfs_write_data *data) +void pnfs_ld_write_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1554,7 +1537,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_write_done); static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_write_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1563,11 +1546,11 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); desc->pg_recoalesce = 1; } - nfs_writedata_release(data); + nfs_pgio_data_release(data); } static enum pnfs_try_status -pnfs_try_to_write_data(struct nfs_write_data *wdata, +pnfs_try_to_write_data(struct nfs_pgio_data *wdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg, int how) @@ -1589,41 +1572,36 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata, } static void -pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how) +pnfs_do_write(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr, int how) { - struct nfs_write_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_write_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_write_through_mds(desc, data); - } + trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_write_through_mds(desc, data); pnfs_put_lseg(lseg); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_writehdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_write_header *whdr; + struct nfs_rw_header *whdr; struct nfs_pgio_header *hdr; int ret; - whdr = nfs_writehdr_alloc(); + whdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!whdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); pnfs_put_lseg(desc->pg_lseg); @@ -1634,12 +1612,12 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); - ret = nfs_generic_flush(desc, hdr); + ret = nfs_generic_pgio(desc, hdr); if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags); + pnfs_do_write(desc, hdr, desc->pg_ioflags); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; @@ -1655,7 +1633,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, LIST_HEAD(failed); /* Resend all requests through the MDS */ - nfs_pageio_init_read(&pgio, inode, compl_ops); + nfs_pageio_init_read(&pgio, inode, true, compl_ops); pgio.pg_dreq = dreq; while (!list_empty(head)) { struct nfs_page *req = nfs_list_entry(head->next); @@ -1674,7 +1652,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); -static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1693,7 +1671,7 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) /* * Called by non rpc-based layout drivers */ -void pnfs_ld_read_done(struct nfs_read_data *data) +void pnfs_ld_read_done(struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1709,7 +1687,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_read_done); static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, - struct nfs_read_data *data) + struct nfs_pgio_data *data) { struct nfs_pgio_header *hdr = data->header; @@ -1718,14 +1696,14 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); desc->pg_recoalesce = 1; } - nfs_readdata_release(data); + nfs_pgio_data_release(data); } /* * Call the appropriate parallel I/O subsystem read function. */ static enum pnfs_try_status -pnfs_try_to_read_data(struct nfs_read_data *rdata, +pnfs_try_to_read_data(struct nfs_pgio_data *rdata, const struct rpc_call_ops *call_ops, struct pnfs_layout_segment *lseg) { @@ -1747,41 +1725,35 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, } static void -pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head) +pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { - struct nfs_read_data *data; + struct nfs_pgio_data *data = hdr->data; const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; + enum pnfs_try_status trypnfs; desc->pg_lseg = NULL; - while (!list_empty(head)) { - enum pnfs_try_status trypnfs; - - data = list_first_entry(head, struct nfs_read_data, list); - list_del_init(&data->list); - - trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); - if (trypnfs == PNFS_NOT_ATTEMPTED) - pnfs_read_through_mds(desc, data); - } + trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); + if (trypnfs == PNFS_NOT_ATTEMPTED) + pnfs_read_through_mds(desc, data); pnfs_put_lseg(lseg); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) { pnfs_put_lseg(hdr->lseg); - nfs_readhdr_free(hdr); + nfs_rw_header_free(hdr); } EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_read_header *rhdr; + struct nfs_rw_header *rhdr; struct nfs_pgio_header *hdr; int ret; - rhdr = nfs_readhdr_alloc(); + rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); if (!rhdr) { desc->pg_completion_ops->error_cleanup(&desc->pg_list); ret = -ENOMEM; @@ -1793,12 +1765,12 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); atomic_inc(&hdr->refcnt); - ret = nfs_generic_pagein(desc, hdr); + ret = nfs_generic_pgio(desc, hdr); if (ret != 0) { pnfs_put_lseg(desc->pg_lseg); desc->pg_lseg = NULL; } else - pnfs_do_multiple_reads(desc, &hdr->rpc_list); + pnfs_do_read(desc, hdr); if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); return ret; @@ -1810,7 +1782,7 @@ static void pnfs_clear_layoutcommitting(struct inode *inode) unsigned long *bitlock = &NFS_I(inode)->flags; clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); } @@ -1848,7 +1820,7 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_write_data *wdata) +pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) { struct nfs_pgio_header *hdr = wdata->header; struct inode *inode = hdr->inode; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 023793909778..4fb309a2b4c4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -113,8 +113,8 @@ struct pnfs_layoutdriver_type { * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS */ - enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); - enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); + enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); void (*free_deviceid_node) (struct nfs4_deviceid_node *); @@ -180,11 +180,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, - const struct nfs_pgio_completion_ops *); -void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, - int, const struct nfs_pgio_completion_ops *); - void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); @@ -192,7 +187,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); -bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); +size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req); void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -217,13 +213,13 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); -void pnfs_ld_write_done(struct nfs_write_data *); -void pnfs_ld_read_done(struct nfs_read_data *); +void pnfs_ld_write_done(struct nfs_pgio_data *); +void pnfs_ld_read_done(struct nfs_pgio_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, @@ -275,7 +271,7 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { atomic_inc(&lseg->pls_refcount); - smp_mb__after_atomic_inc(); + smp_mb__after_atomic(); } return lseg; } @@ -461,18 +457,6 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } -static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_read(pgio, inode, compl_ops); -} - -static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, - const struct nfs_pgio_completion_ops *compl_ops) -{ - nfs_pageio_init_write(pgio, inode, ioflags, compl_ops); -} - static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e55ce9e8b034..c171ce1a8a30 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -578,7 +578,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return 0; } -static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -594,18 +594,18 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) +static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; } -static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) { rpc_call_start(task); return 0; } -static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) { struct inode *inode = data->header->inode; @@ -614,19 +614,13 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } -static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) +static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) { /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ data->args.stable = NFS_FILE_SYNC; msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; } -static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) -{ - rpc_call_start(task); - return 0; -} - static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) { BUG(); @@ -734,13 +728,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .fsinfo = nfs_proc_fsinfo, .pathconf = nfs_proc_pathconf, .decode_dirent = nfs2_decode_dirent, + .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare, .read_setup = nfs_proc_read_setup, - .read_pageio_init = nfs_pageio_init_read, - .read_rpc_prepare = nfs_proc_read_rpc_prepare, .read_done = nfs_read_done, .write_setup = nfs_proc_write_setup, - .write_pageio_init = nfs_pageio_init_write, - .write_rpc_prepare = nfs_proc_write_rpc_prepare, .write_done = nfs_write_done, .commit_setup = nfs_proc_commit_setup, .commit_rpc_prepare = nfs_proc_commit_rpc_prepare, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 411aedda14bb..e818a475ca64 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -24,85 +24,24 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static const struct nfs_pageio_ops nfs_pageio_read_ops; -static const struct rpc_call_ops nfs_read_common_ops; static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; -struct nfs_read_header *nfs_readhdr_alloc(void) +static struct nfs_rw_header *nfs_readhdr_alloc(void) { - struct nfs_read_header *rhdr; - - rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); - if (rhdr) { - struct nfs_pgio_header *hdr = &rhdr->header; - - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - } - return rhdr; + return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(nfs_readhdr_alloc); -static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr, - unsigned int pagecount) +static void nfs_readhdr_free(struct nfs_rw_header *rhdr) { - struct nfs_read_data *data, *prealloc; - - prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data; - if (prealloc->header == NULL) - data = prealloc; - else - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - if (nfs_pgarray_set(&data->pages, pagecount)) { - data->header = hdr; - atomic_inc(&hdr->refcnt); - } else { - if (data != prealloc) - kfree(data); - data = NULL; - } -out: - return data; -} - -void nfs_readhdr_free(struct nfs_pgio_header *hdr) -{ - struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header); - kmem_cache_free(nfs_rdata_cachep, rhdr); } -EXPORT_SYMBOL_GPL(nfs_readhdr_free); - -void nfs_readdata_release(struct nfs_read_data *rdata) -{ - struct nfs_pgio_header *hdr = rdata->header; - struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header); - - put_nfs_open_context(rdata->args.context); - if (rdata->pages.pagevec != rdata->pages.page_array) - kfree(rdata->pages.pagevec); - if (rdata == &read_header->rpc_data) { - rdata->header = NULL; - rdata = NULL; - } - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - /* Note: we only free the rpc_task after callbacks are done. - * See the comment in rpc_free_task() for why - */ - kfree(rdata); -} -EXPORT_SYMBOL_GPL(nfs_readdata_release); static int nfs_return_empty_page(struct page *page) @@ -114,17 +53,24 @@ int nfs_return_empty_page(struct page *page) } void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode, + struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops, - NFS_SERVER(inode)->rsize, 0); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_read_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { - pgio->pg_ops = &nfs_pageio_read_ops; + pgio->pg_ops = &nfs_pgio_rw_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); @@ -139,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, inode, page, 0, len); + new = nfs_create_request(ctx, page, NULL, 0, len); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -147,7 +93,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); NFS_I(inode)->read_io += pgio.pg_bytes_written; @@ -158,10 +105,16 @@ static void nfs_readpage_release(struct nfs_page *req) { struct inode *d_inode = req->wb_context->dentry->d_inode; - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, + (long long)req_offset(req)); - unlock_page(req->wb_page); + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, @@ -171,7 +124,12 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } -/* Note io was page aligned */ +static void nfs_page_group_set_uptodate(struct nfs_page *req) +{ + if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) + SetPageUptodate(req->wb_page); +} + static void nfs_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -181,21 +139,32 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; + unsigned long start = req->wb_pgbase; + unsigned long end = req->wb_pgbase + req->wb_bytes; if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { - if (bytes > hdr->good_bytes) - zero_user(page, 0, PAGE_SIZE); - else if (hdr->good_bytes - bytes < PAGE_SIZE) - zero_user_segment(page, - hdr->good_bytes & ~PAGE_MASK, - PAGE_SIZE); + /* note: regions of the page not covered by a + * request are zeroed in nfs_readpage_async / + * readpage_async_filler */ + if (bytes > hdr->good_bytes) { + /* nothing in this request was good, so zero + * the full extent of the request */ + zero_user_segment(page, start, end); + + } else if (hdr->good_bytes - bytes < req->wb_bytes) { + /* part of this request has good bytes, but + * not all. zero the bad bytes */ + start += hdr->good_bytes - bytes; + WARN_ON(start < req->wb_pgbase); + zero_user_segment(page, start, end); + } } bytes += req->wb_bytes; if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (bytes <= hdr->good_bytes) - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); } else - SetPageUptodate(page); + nfs_page_group_set_uptodate(req); nfs_list_remove_request(req); nfs_readpage_release(req); } @@ -203,95 +172,14 @@ out: hdr->release(hdr); } -int nfs_initiate_read(struct rpc_clnt *clnt, - struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, int flags) +static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - .rpc_client = clnt, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | swap_flags | flags, - }; - /* Set up the initial task struct. */ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ " - "offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; -} -EXPORT_SYMBOL_GPL(nfs_initiate_read); - -/* - * Set up the NFS read request struct - */ -static void nfs_read_rpcsetup(struct nfs_read_data *data, - unsigned int count, unsigned int offset) -{ - struct nfs_page *req = data->header->req; - - data->args.fh = NFS_FH(data->header->inode); - data->args.offset = req_offset(req) + offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pages.pagevec; - data->args.count = count; - data->args.context = get_nfs_open_context(req->wb_context); - data->args.lock_context = req->wb_lock_context; - - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.eof = 0; - nfs_fattr_init(&data->fattr); -} - -static int nfs_do_read(struct nfs_read_data *data, - const struct rpc_call_ops *call_ops) -{ - struct inode *inode = data->header->inode; - - return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0); -} - -static int -nfs_do_multiple_reads(struct list_head *head, - const struct rpc_call_ops *call_ops) -{ - struct nfs_read_data *data; - int ret = 0; - - while (!list_empty(head)) { - int ret2; - - data = list_first_entry(head, struct nfs_read_data, list); - list_del_init(&data->list); - - ret2 = nfs_do_read(data, call_ops); - if (ret == 0) - ret = ret2; - } - return ret; + task_setup_data->flags |= swap_flags; + NFS_PROTO(inode)->read_setup(data, msg); } static void @@ -311,143 +199,14 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { .completion = nfs_read_completion, }; -static void nfs_pagein_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - set_bit(NFS_IOHDR_REDO, &hdr->flags); - while (!list_empty(&hdr->rpc_list)) { - struct nfs_read_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_read_data, list); - list_del(&data->list); - nfs_readdata_release(data); - } - desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple requests to fill a single page. - * - * We optimize to reduce the number of read operations on the wire. If we - * detect that we're reading a page, or an area of a page, that is past the - * end of file, we do not generate NFS read operations but just clear the - * parts of the page that would have come back zero from the server anyway. - * - * We rely on the cached value of i_size to make this determination; another - * client can fill pages on the server past our cached end-of-file, but we - * won't see the new data until our attribute cache is updated. This is more - * or less conventional NFS client behavior. - */ -static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req = hdr->req; - struct page *page = req->wb_page; - struct nfs_read_data *data; - size_t rsize = desc->pg_bsize, nbytes; - unsigned int offset; - - offset = 0; - nbytes = desc->pg_count; - do { - size_t len = min(nbytes,rsize); - - data = nfs_readdata_alloc(hdr, 1); - if (!data) { - nfs_pagein_error(desc, hdr); - return -ENOMEM; - } - data->pages.pagevec[0] = page; - nfs_read_rpcsetup(data, len, offset); - list_add(&data->list, &hdr->rpc_list); - nbytes -= len; - offset += len; - } while (nbytes != 0); - - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - desc->pg_rpc_callops = &nfs_read_common_ops; - return 0; -} - -static int nfs_pagein_one(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req; - struct page **pages; - struct nfs_read_data *data; - struct list_head *head = &desc->pg_list; - - data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base, - desc->pg_count)); - if (!data) { - nfs_pagein_error(desc, hdr); - return -ENOMEM; - } - - pages = data->pages.pagevec; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; - } - - nfs_read_rpcsetup(data, desc->pg_count, 0); - list_add(&data->list, &hdr->rpc_list); - desc->pg_rpc_callops = &nfs_read_common_ops; - return 0; -} - -int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_pagein_multi(desc, hdr); - return nfs_pagein_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_pagein); - -static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) -{ - struct nfs_read_header *rhdr; - struct nfs_pgio_header *hdr; - int ret; - - rhdr = nfs_readhdr_alloc(); - if (!rhdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - return -ENOMEM; - } - hdr = &rhdr->header; - nfs_pgheader_init(desc, hdr, nfs_readhdr_free); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_pagein(desc, hdr); - if (ret == 0) - ret = nfs_do_multiple_reads(&hdr->rpc_list, - desc->pg_rpc_callops); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - return ret; -} - -static const struct nfs_pageio_ops nfs_pageio_read_ops = { - .pg_test = nfs_generic_pg_test, - .pg_doio = nfs_generic_pg_readpages, -}; - /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). */ -int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) +static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct inode *inode = data->header->inode; - int status; - - dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid, - task->tk_status); - - status = NFS_PROTO(inode)->read_done(task, data); + int status = NFS_PROTO(inode)->read_done(task, data); if (status != 0) return status; @@ -460,10 +219,10 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_readargs *argp = &data->args; - struct nfs_readres *resp = &data->res; + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; /* This is a short read! */ nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); @@ -480,17 +239,11 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data rpc_restart_call_prepare(task); } -static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) +static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) { - struct nfs_read_data *data = calldata; struct nfs_pgio_header *hdr = data->header; - /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */ - if (nfs_readpage_result(task, data) != 0) - return; - if (task->tk_status < 0) - nfs_set_pgio_error(hdr, task->tk_status, data->args.offset); - else if (data->res.eof) { + if (data->res.eof) { loff_t bound; bound = data->args.offset + data->res.count; @@ -505,26 +258,6 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata) nfs_readpage_retry(task, data); } -static void nfs_readpage_release_common(void *calldata) -{ - nfs_readdata_release(calldata); -} - -void nfs_read_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_read_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - -static const struct rpc_call_ops nfs_read_common_ops = { - .rpc_call_prepare = nfs_read_prepare, - .rpc_call_done = nfs_readpage_result_common, - .rpc_release = nfs_readpage_release_common, -}; - /* * Read a page over NFS. * We read the page synchronously in the following case: @@ -592,7 +325,6 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page_file_mapping(page)->host; struct nfs_page *new; unsigned int len; int error; @@ -601,7 +333,7 @@ readpage_async_filler(void *data, struct page *page) if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, inode, page, 0, len); + new = nfs_create_request(desc->ctx, page, NULL, 0, len); if (IS_ERR(new)) goto out_error; @@ -654,7 +386,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops); + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); @@ -671,7 +404,7 @@ out: int __init nfs_init_readpagecache(void) { nfs_rdata_cachep = kmem_cache_create("nfs_read_data", - sizeof(struct nfs_read_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_rdata_cachep == NULL) @@ -684,3 +417,12 @@ void nfs_destroy_readpagecache(void) { kmem_cache_destroy(nfs_rdata_cachep); } + +static const struct nfs_rw_ops nfs_rw_read_ops = { + .rw_mode = FMODE_READ, + .rw_alloc_header = nfs_readhdr_alloc, + .rw_free_header = nfs_readhdr_free, + .rw_done = nfs_readpage_done, + .rw_result = nfs_readpage_result, + .rw_initiate = nfs_initiate_read, +}; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2cb56943e232..084af1060d79 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2180,11 +2180,23 @@ out_no_address: return -EINVAL; } +#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ + | NFS_MOUNT_SECURE \ + | NFS_MOUNT_TCP \ + | NFS_MOUNT_VER3 \ + | NFS_MOUNT_KERBEROS \ + | NFS_MOUNT_NONLM \ + | NFS_MOUNT_BROKEN_SUID \ + | NFS_MOUNT_STRICTLOCK \ + | NFS_MOUNT_UNSHARED \ + | NFS_MOUNT_NORESVPORT \ + | NFS_MOUNT_LEGACY_INTERFACE) + static int nfs_compare_remount_data(struct nfs_server *nfss, struct nfs_parsed_mount_data *data) { - if (data->flags != nfss->flags || + if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK || data->rsize != nfss->rsize || data->wsize != nfss->wsize || data->version != nfss->nfs_client->rpc_ops->version || @@ -2248,6 +2260,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; data->version = nfsvers; data->minorversion = nfss->nfs_client->cl_minorversion; + data->net = current->nsproxy->net_ns; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); @@ -2347,18 +2360,6 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) nfs_initialise_sb(sb); } -#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ - | NFS_MOUNT_SECURE \ - | NFS_MOUNT_TCP \ - | NFS_MOUNT_VER3 \ - | NFS_MOUNT_KERBEROS \ - | NFS_MOUNT_NONLM \ - | NFS_MOUNT_BROKEN_SUID \ - | NFS_MOUNT_STRICTLOCK \ - | NFS_MOUNT_UNSHARED \ - | NFS_MOUNT_NORESVPORT \ - | NFS_MOUNT_LEGACY_INTERFACE) - static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) { const struct nfs_server *a = s->s_fs_info; diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 6b3f2535a3ec..bb6ed810fa6f 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -13,7 +13,7 @@ static struct ctl_table_header *nfs_callback_sysctl_table; -static ctl_table nfs_cb_sysctls[] = { +static struct ctl_table nfs_cb_sysctls[] = { { .procname = "nfs_mountpoint_timeout", .data = &nfs_mountpoint_expiry_timeout, @@ -31,7 +31,7 @@ static ctl_table nfs_cb_sysctls[] = { { } }; -static ctl_table nfs_cb_sysctl_dir[] = { +static struct ctl_table nfs_cb_sysctl_dir[] = { { .procname = "nfs", .mode = 0555, @@ -40,7 +40,7 @@ static ctl_table nfs_cb_sysctl_dir[] = { { } }; -static ctl_table nfs_cb_sysctl_root[] = { +static struct ctl_table nfs_cb_sysctl_root[] = { { .procname = "fs", .mode = 0555, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9a3b6a4cd6b9..5e2f10304548 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -42,10 +42,11 @@ * Local function declarations */ static void nfs_redirty_request(struct nfs_page *req); -static const struct rpc_call_ops nfs_write_common_ops; static const struct rpc_call_ops nfs_commit_ops; static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; +static const struct nfs_rw_ops nfs_rw_write_ops; +static void nfs_clear_request_commit(struct nfs_page *req); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -70,76 +71,19 @@ void nfs_commit_free(struct nfs_commit_data *p) } EXPORT_SYMBOL_GPL(nfs_commit_free); -struct nfs_write_header *nfs_writehdr_alloc(void) +static struct nfs_rw_header *nfs_writehdr_alloc(void) { - struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - - if (p) { - struct nfs_pgio_header *hdr = &p->header; + struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); + if (p) memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&hdr->pages); - INIT_LIST_HEAD(&hdr->rpc_list); - spin_lock_init(&hdr->lock); - atomic_set(&hdr->refcnt, 0); - hdr->verf = &p->verf; - } return p; } -EXPORT_SYMBOL_GPL(nfs_writehdr_alloc); -static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr, - unsigned int pagecount) +static void nfs_writehdr_free(struct nfs_rw_header *whdr) { - struct nfs_write_data *data, *prealloc; - - prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data; - if (prealloc->header == NULL) - data = prealloc; - else - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - if (nfs_pgarray_set(&data->pages, pagecount)) { - data->header = hdr; - atomic_inc(&hdr->refcnt); - } else { - if (data != prealloc) - kfree(data); - data = NULL; - } -out: - return data; -} - -void nfs_writehdr_free(struct nfs_pgio_header *hdr) -{ - struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header); mempool_free(whdr, nfs_wdata_mempool); } -EXPORT_SYMBOL_GPL(nfs_writehdr_free); - -void nfs_writedata_release(struct nfs_write_data *wdata) -{ - struct nfs_pgio_header *hdr = wdata->header; - struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header); - - put_nfs_open_context(wdata->args.context); - if (wdata->pages.pagevec != wdata->pages.page_array) - kfree(wdata->pages.pagevec); - if (wdata == &write_header->rpc_data) { - wdata->header = NULL; - wdata = NULL; - } - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - /* Note: we only free the rpc_task after callbacks are done. - * See the comment in rpc_free_task() for why - */ - kfree(wdata); -} -EXPORT_SYMBOL_GPL(nfs_writedata_release); static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) { @@ -148,8 +92,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +/* + * nfs_page_find_head_request_locked - find head request associated with @page + * + * must be called while holding the inode lock. + * + * returns matching head request with reference held, or NULL if not found. + */ static struct nfs_page * -nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; @@ -161,25 +112,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) /* Linearly search the commit list for the correct req */ list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { if (freq->wb_page == page) { - req = freq; + req = freq->wb_head; break; } } } - if (req) + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } return req; } -static struct nfs_page *nfs_page_find_request(struct page *page) +/* + * nfs_page_find_head_request - find head request associated with @page + * + * returns matching head request with reference held, or NULL if not found. + */ +static struct nfs_page *nfs_page_find_head_request(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -211,18 +170,78 @@ static void nfs_set_pageerror(struct page *page) nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } +/* + * nfs_page_group_search_locked + * @head - head request of page group + * @page_offset - offset into page + * + * Search page group with head @head to find a request that contains the + * page offset @page_offset. + * + * Returns a pointer to the first matching nfs request, or NULL if no + * match is found. + * + * Must be called with the page group lock held + */ +static struct nfs_page * +nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) +{ + struct nfs_page *req; + + WARN_ON_ONCE(head != head->wb_head); + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); + + req = head; + do { + if (page_offset >= req->wb_pgbase && + page_offset < (req->wb_pgbase + req->wb_bytes)) + return req; + + req = req->wb_this_page; + } while (req != head); + + return NULL; +} + +/* + * nfs_page_group_covers_page + * @head - head request of page group + * + * Return true if the page group with head @head covers the whole page, + * returns false otherwise + */ +static bool nfs_page_group_covers_page(struct nfs_page *req) +{ + struct nfs_page *tmp; + unsigned int pos = 0; + unsigned int len = nfs_page_length(req->wb_page); + + nfs_page_group_lock(req); + + do { + tmp = nfs_page_group_search_locked(req->wb_head, pos); + if (tmp) { + /* no way this should happen */ + WARN_ON_ONCE(tmp->wb_pgbase != pos); + pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); + } + } while (tmp && pos < len); + + nfs_page_group_unlock(req); + WARN_ON_ONCE(pos > len); + return pos == len; +} + /* We can set the PG_uptodate flag if we see that a write request * covers the full page. */ -static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count) +static void nfs_mark_uptodate(struct nfs_page *req) { - if (PageUptodate(page)) - return; - if (base != 0) + if (PageUptodate(req->wb_page)) return; - if (count != nfs_page_length(page)) + if (!nfs_page_group_covers_page(req)) return; - SetPageUptodate(page); + SetPageUptodate(req->wb_page); } static int wb_priority(struct writeback_control *wbc) @@ -258,46 +277,259 @@ static void nfs_set_page_writeback(struct page *page) } } -static void nfs_end_page_writeback(struct page *page) +static void nfs_end_page_writeback(struct nfs_page *req) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); - end_page_writeback(page); + if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + return; + + end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) + +/* nfs_page_group_clear_bits + * @req - an nfs request + * clears all page group related bits from @req + */ +static void +nfs_page_group_clear_bits(struct nfs_page *req) +{ + clear_bit(PG_TEARDOWN, &req->wb_flags); + clear_bit(PG_UNLOCKPAGE, &req->wb_flags); + clear_bit(PG_UPTODATE, &req->wb_flags); + clear_bit(PG_WB_END, &req->wb_flags); + clear_bit(PG_REMOVE, &req->wb_flags); +} + + +/* + * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req + * + * this is a helper function for nfs_lock_and_join_requests + * + * @inode - inode associated with request page group, must be holding inode lock + * @head - head request of page group, must be holding head lock + * @req - request that couldn't lock and needs to wait on the req bit lock + * @nonblock - if true, don't actually wait + * + * NOTE: this must be called holding page_group bit lock and inode spin lock + * and BOTH will be released before returning. + * + * returns 0 on success, < 0 on error. + */ +static int +nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, + struct nfs_page *req, bool nonblock) + __releases(&inode->i_lock) +{ + struct nfs_page *tmp; + int ret; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) + nfs_unlock_request(tmp); + + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); + + /* grab a ref on the request that will be waited on */ + kref_get(&req->wb_kref); + + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + + /* release ref from nfs_page_find_head_request_locked */ + nfs_release_request(head); + + if (!nonblock) + ret = nfs_wait_on_request(req); + else + ret = -EAGAIN; + nfs_release_request(req); + + return ret; +} + +/* + * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests + * + * @destroy_list - request list (using wb_this_page) terminated by @old_head + * @old_head - the old head of the list + * + * All subrequests must be locked and removed from all lists, so at this point + * they are only "active" in this function, and possibly in nfs_wait_on_request + * with a reference held by some other context. + */ +static void +nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, + struct nfs_page *old_head) +{ + while (destroy_list) { + struct nfs_page *subreq = destroy_list; + + destroy_list = (subreq->wb_this_page == old_head) ? + NULL : subreq->wb_this_page; + + WARN_ON_ONCE(old_head != subreq->wb_head); + + /* make sure old group is not used */ + subreq->wb_head = subreq; + subreq->wb_this_page = subreq; + + nfs_clear_request_commit(subreq); + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_request(subreq); + + if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { + /* release ref on old head request */ + nfs_release_request(old_head); + + nfs_page_group_clear_bits(subreq); + + /* release the PG_INODE_REF reference */ + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) + nfs_release_request(subreq); + else + WARN_ON_ONCE(1); + } else { + WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); + /* zombie requests have already released the last + * reference and were waiting on the rest of the + * group to complete. Since it's no longer part of a + * group, simply free the request */ + nfs_page_group_clear_bits(subreq); + nfs_free_request(subreq); + } + } +} + +/* + * nfs_lock_and_join_requests - join all subreqs to the head req and return + * a locked reference, cancelling any pending + * operations for this page. + * + * @page - the page used to lookup the "page group" of nfs_page structures + * @nonblock - if true, don't block waiting for request locks + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page, bool nonblock) { struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *head, *subreq; + struct nfs_page *destroy_list = NULL; + unsigned int total_bytes; int ret; +try_again: + total_bytes = 0; + + WARN_ON_ONCE(destroy_list); + spin_lock(&inode->i_lock); - for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); - if (req == NULL) - break; - if (nfs_lock_request(req)) - break; - /* Note: If we hold the page lock, as is the case in nfs_writepage, - * then the call to nfs_lock_request() will always - * succeed provided that someone hasn't already marked the - * request as dirty (in which case we don't care). - */ + + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_page_find_head_request_locked(NFS_I(inode), page); + + if (!head) { spin_unlock(&inode->i_lock); - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - if (ret != 0) + return NULL; + } + + /* lock each request in the page group */ + nfs_page_group_lock(head); + subreq = head; + do { + /* + * Subrequests are always contiguous, non overlapping + * and in order. If not, it's a programming error. + */ + WARN_ON_ONCE(subreq->wb_offset != + (head->wb_offset + total_bytes)); + + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + + if (!nfs_lock_request(subreq)) { + /* releases page group bit lock and + * inode spin lock and all references */ + ret = nfs_unroll_locks_and_wait(inode, head, + subreq, nonblock); + + if (ret == 0) + goto try_again; + return ERR_PTR(ret); - spin_lock(&inode->i_lock); + } + + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* Now that all requests are locked, make sure they aren't on any list. + * Commit list removal accounting is done after locks are dropped */ + subreq = head; + do { + nfs_list_remove_request(subreq); + subreq = subreq->wb_this_page; + } while (subreq != head); + + /* unlink subrequests from head, destroy them later */ + if (head->wb_this_page != head) { + /* destroy list will be terminated by head */ + destroy_list = head->wb_this_page; + head->wb_this_page = head; + + /* change head request to cover whole range that + * the former page group covered */ + head->wb_bytes = total_bytes; } + + /* + * prepare head request to be added to new pgio descriptor + */ + nfs_page_group_clear_bits(head); + + /* + * some part of the group was still on the inode list - otherwise + * the group wouldn't be involved in async write. + * grab a reference for the head request, iff it needs one. + */ + if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + kref_get(&head->wb_kref); + + nfs_page_group_unlock(head); + + /* drop lock to clear_request_commit the head req and clean up + * requests on destroy list */ spin_unlock(&inode->i_lock); - return req; + + nfs_destroy_unlinked_subrequests(destroy_list, head); + + /* clean up commit list state */ + nfs_clear_request_commit(head); + + /* still holds ref on head from nfs_page_find_head_request_locked + * and still has lock on head from lock loop */ + return head; } /* @@ -310,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, struct nfs_page *req; int ret = 0; - req = nfs_find_and_lock_request(page, nonblock); + req = nfs_lock_and_join_requests(page, nonblock); if (!req) goto out; ret = PTR_ERR(req); @@ -354,10 +586,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, - page->mapping->host, - wb_priority(wbc), - &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) @@ -400,12 +630,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + &nfs_async_write_completion_ops); err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_FLUSHING); if (err < 0) @@ -425,6 +656,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); + WARN_ON_ONCE(req->wb_this_page != req); + /* Lock the request! */ nfs_lock_request(req); @@ -441,6 +674,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); } nfsi->npages++; + /* this a head request for a page group - mark it as having an + * extra reference so sub groups can follow suit */ + WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); } @@ -452,16 +688,23 @@ static void nfs_inode_remove_request(struct nfs_page *req) { struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *head; - spin_lock(&inode->i_lock); - if (likely(!PageSwapCache(req->wb_page))) { - set_page_private(req->wb_page, 0); - ClearPagePrivate(req->wb_page); - clear_bit(PG_MAPPED, &req->wb_flags); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { + head = req->wb_head; + + spin_lock(&inode->i_lock); + if (likely(!PageSwapCache(head->wb_page))) { + set_page_private(head->wb_page, 0); + ClearPagePrivate(head->wb_page); + clear_bit(PG_MAPPED, &head->wb_flags); + } + nfsi->npages--; + spin_unlock(&inode->i_lock); } - nfsi->npages--; - spin_unlock(&inode->i_lock); - nfs_release_request(req); + + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + nfs_release_request(req); } static void @@ -583,7 +826,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { if (data->verf.committed == NFS_DATA_SYNC) return data->header->lseg == NULL; @@ -614,7 +857,7 @@ nfs_clear_request_commit(struct nfs_page *req) } static inline -int nfs_write_need_commit(struct nfs_write_data *data) +int nfs_write_need_commit(struct nfs_pgio_data *data) { return 0; } @@ -645,7 +888,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { - memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf)); + memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -653,7 +896,7 @@ remove_req: nfs_inode_remove_request(req); next: nfs_unlock_request(req); - nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); nfs_release_request(req); } out: @@ -661,7 +904,7 @@ out: } #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) -static unsigned long +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { return cinfo->mds->ncommit; @@ -718,7 +961,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, } #else -static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) +unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { return 0; } @@ -754,10 +997,14 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(NFS_I(inode), page); + req = nfs_page_find_head_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; + /* should be handled by nfs_flush_incompatible */ + WARN_ON_ONCE(req->wb_head != req); + WARN_ON_ONCE(req->wb_this_page != req); + rqend = req->wb_offset + req->wb_bytes; /* * Tell the caller to flush out the request if @@ -819,7 +1066,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, req = nfs_try_to_update_request(inode, page, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, inode, page, offset, bytes); + req = nfs_create_request(ctx, page, NULL, offset, bytes); if (IS_ERR(req)) goto out; nfs_inode_add_request(inode, req); @@ -837,7 +1084,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, return PTR_ERR(req); /* Update file length */ nfs_grow_file(page, offset, count); - nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_uptodate(req); nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); return 0; @@ -858,11 +1105,13 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. */ do { - req = nfs_page_find_request(page); + req = nfs_page_find_head_request(page); if (req == NULL) return 0; l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || req->wb_context != ctx; + /* for now, flush if more than 1 request in page_group */ + do_flush |= req->wb_this_page != req; if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; @@ -913,12 +1162,14 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE)) + if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) return false; smp_rmb(); if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) return false; out: + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return false; return PageUptodate(page) != 0; } @@ -990,126 +1241,17 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -int nfs_initiate_write(struct rpc_clnt *clnt, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how, int flags) +static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, + struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = data->header->inode; int priority = flush_task_priority(how); - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = data->header->cred, - }; - struct rpc_task_setup task_setup_data = { - .rpc_client = clnt, - .task = &data->task, - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, - .priority = priority, - }; - int ret = 0; - - /* Set up the initial task struct. */ - NFS_PROTO(inode)->write_setup(data, &msg); - dprintk("NFS: %5u initiated write call " - "(req %s/%llu, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); + task_setup_data->priority = priority; + NFS_PROTO(inode)->write_setup(data, msg); nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, - &task_setup_data.rpc_client, &msg, data); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; -} -EXPORT_SYMBOL_GPL(nfs_initiate_write); - -/* - * Set up the argument/result storage required for the RPC call. - */ -static void nfs_write_rpcsetup(struct nfs_write_data *data, - unsigned int count, unsigned int offset, - int how, struct nfs_commit_info *cinfo) -{ - struct nfs_page *req = data->header->req; - - /* Set up the RPC argument and reply structs - * NB: take care not to mess about with data->commit et al. */ - - data->args.fh = NFS_FH(data->header->inode); - data->args.offset = req_offset(req) + offset; - /* pnfs_set_layoutcommit needs this */ - data->mds_offset = data->args.offset; - data->args.pgbase = req->wb_pgbase + offset; - data->args.pages = data->pages.pagevec; - data->args.count = count; - data->args.context = get_nfs_open_context(req->wb_context); - data->args.lock_context = req->wb_lock_context; - data->args.stable = NFS_UNSTABLE; - switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { - case 0: - break; - case FLUSH_COND_STABLE: - if (nfs_reqs_to_commit(cinfo)) - break; - default: - data->args.stable = NFS_FILE_SYNC; - } - - data->res.fattr = &data->fattr; - data->res.count = count; - data->res.verf = &data->verf; - nfs_fattr_init(&data->fattr); -} - -static int nfs_do_write(struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - int how) -{ - struct inode *inode = data->header->inode; - - return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0); -} - -static int nfs_do_multiple_writes(struct list_head *head, - const struct rpc_call_ops *call_ops, - int how) -{ - struct nfs_write_data *data; - int ret = 0; - - while (!list_empty(head)) { - int ret2; - - data = list_first_entry(head, struct nfs_write_data, list); - list_del_init(&data->list); - - ret2 = nfs_do_write(data, call_ops, how); - if (ret == 0) - ret = ret2; - } - return ret; + &task_setup_data->rpc_client, msg, data); } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1120,7 +1262,7 @@ static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); nfs_unlock_request(req); - nfs_end_page_writeback(req->wb_page); + nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1140,173 +1282,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .completion = nfs_write_completion, }; -static void nfs_flush_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - set_bit(NFS_IOHDR_REDO, &hdr->flags); - while (!list_empty(&hdr->rpc_list)) { - struct nfs_write_data *data = list_first_entry(&hdr->rpc_list, - struct nfs_write_data, list); - list_del(&data->list); - nfs_writedata_release(data); - } - desc->pg_completion_ops->error_cleanup(&desc->pg_list); -} - -/* - * Generate multiple small requests to write out a single - * contiguous dirty area on one page. - */ -static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req = hdr->req; - struct page *page = req->wb_page; - struct nfs_write_data *data; - size_t wsize = desc->pg_bsize, nbytes; - unsigned int offset; - int requests = 0; - struct nfs_commit_info cinfo; - - nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - - if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) || - desc->pg_count > wsize)) - desc->pg_ioflags &= ~FLUSH_COND_STABLE; - - - offset = 0; - nbytes = desc->pg_count; - do { - size_t len = min(nbytes, wsize); - - data = nfs_writedata_alloc(hdr, 1); - if (!data) { - nfs_flush_error(desc, hdr); - return -ENOMEM; - } - data->pages.pagevec[0] = page; - nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo); - list_add(&data->list, &hdr->rpc_list); - requests++; - nbytes -= len; - offset += len; - } while (nbytes != 0); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - desc->pg_rpc_callops = &nfs_write_common_ops; - return 0; -} - -/* - * Create an RPC task for the given write request and kick it. - * The page must have been locked by the caller. - * - * It may happen that the page we're passed is not marked dirty. - * This is the case if nfs_updatepage detects a conflicting request - * that has been written but not committed. - */ -static int nfs_flush_one(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - struct nfs_page *req; - struct page **pages; - struct nfs_write_data *data; - struct list_head *head = &desc->pg_list; - struct nfs_commit_info cinfo; - - data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base, - desc->pg_count)); - if (!data) { - nfs_flush_error(desc, hdr); - return -ENOMEM; - } - - nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); - pages = data->pages.pagevec; - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; - } - - if ((desc->pg_ioflags & FLUSH_COND_STABLE) && - (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) - desc->pg_ioflags &= ~FLUSH_COND_STABLE; - - /* Set up the argument struct */ - nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); - list_add(&data->list, &hdr->rpc_list); - desc->pg_rpc_callops = &nfs_write_common_ops; - return 0; -} - -int nfs_generic_flush(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) -{ - if (desc->pg_bsize < PAGE_CACHE_SIZE) - return nfs_flush_multi(desc, hdr); - return nfs_flush_one(desc, hdr); -} -EXPORT_SYMBOL_GPL(nfs_generic_flush); - -static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) -{ - struct nfs_write_header *whdr; - struct nfs_pgio_header *hdr; - int ret; - - whdr = nfs_writehdr_alloc(); - if (!whdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - return -ENOMEM; - } - hdr = &whdr->header; - nfs_pgheader_init(desc, hdr, nfs_writehdr_free); - atomic_inc(&hdr->refcnt); - ret = nfs_generic_flush(desc, hdr); - if (ret == 0) - ret = nfs_do_multiple_writes(&hdr->rpc_list, - desc->pg_rpc_callops, - desc->pg_ioflags); - if (atomic_dec_and_test(&hdr->refcnt)) - hdr->completion_ops->completion(hdr); - return ret; -} - -static const struct nfs_pageio_ops nfs_pageio_write_ops = { - .pg_test = nfs_generic_pg_test, - .pg_doio = nfs_generic_pg_writepages, -}; - void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, - struct inode *inode, int ioflags, + struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops) { - nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops, - NFS_SERVER(inode)->wsize, ioflags); + struct nfs_server *server = NFS_SERVER(inode); + const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops; + +#ifdef CONFIG_NFS_V4_1 + if (server->pnfs_curr_ld && !force_mds) + pg_ops = server->pnfs_curr_ld->pg_write_ops; +#endif + nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { - pgio->pg_ops = &nfs_pageio_write_ops; + pgio->pg_ops = &nfs_pgio_rw_ops; pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); -void nfs_write_prepare(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - int err; - err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); - if (err) - rpc_exit(task, err); -} - void nfs_commit_prepare(struct rpc_task *task, void *calldata) { struct nfs_commit_data *data = calldata; @@ -1314,23 +1313,8 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -/* - * Handle a write reply that flushes a whole page. - * - * FIXME: There is an inherent race with invalidate_inode_pages and - * writebacks since the page->count is kept > 1 for as long - * as the page has a write request pending. - */ -static void nfs_writeback_done_common(struct rpc_task *task, void *calldata) -{ - struct nfs_write_data *data = calldata; - - nfs_writeback_done(task, data); -} - -static void nfs_writeback_release_common(void *calldata) +static void nfs_writeback_release_common(struct nfs_pgio_data *data) { - struct nfs_write_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; @@ -1339,34 +1323,46 @@ static void nfs_writeback_release_common(void *calldata) if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) ; /* Do nothing */ else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); - else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) + memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf)); + else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf))) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } - nfs_writedata_release(data); } -static const struct rpc_call_ops nfs_write_common_ops = { - .rpc_call_prepare = nfs_write_prepare, - .rpc_call_done = nfs_writeback_done_common, - .rpc_release = nfs_writeback_release_common, -}; +/* + * Special version of should_remove_suid() that ignores capabilities. + */ +static int nfs_should_remove_suid(const struct inode *inode) +{ + umode_t mode = inode->i_mode; + int kill = 0; + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && S_ISREG(mode))) + return kill; + + return 0; +} /* * This function is called when the WRITE call is complete. */ -void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, + struct inode *inode) { - struct nfs_writeargs *argp = &data->args; - struct nfs_writeres *resp = &data->res; - struct inode *inode = data->header->inode; int status; - dprintk("NFS: %5u nfs_writeback_done (status %d)\n", - task->tk_pid, task->tk_status); - /* * ->write_done will attempt to use post-op attributes to detect * conflicting writes by other clients. A strict interpretation @@ -1376,11 +1372,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ status = NFS_PROTO(inode)->write_done(task, data); if (status != 0) - return; - nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count); + return status; + nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) - if (resp->verf->committed < argp->stable && task->tk_status >= 0) { + if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { /* We tried a write call, but the server did not * commit data to stable storage even though we * requested it. @@ -1396,18 +1392,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", NFS_SERVER(inode)->nfs_client->cl_hostname, - resp->verf->committed, argp->stable); + data->res.verf->committed, data->args.stable); complain = jiffies + 300 * HZ; } } #endif - if (task->tk_status < 0) - nfs_set_pgio_error(data->header, task->tk_status, argp->offset); - else if (resp->count < argp->count) { + + /* Deal with the suid/sgid bit corner case */ + if (nfs_should_remove_suid(inode)) + nfs_mark_for_revalidate(inode); + return 0; +} + +/* + * This function is called when the WRITE call is complete. + */ +static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) +{ + struct nfs_pgio_args *argp = &data->args; + struct nfs_pgio_res *resp = &data->res; + + if (resp->count < argp->count) { static unsigned long complain; /* This a short write! */ - nfs_inc_stats(inode, NFSIOS_SHORTWRITE); + nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ if (resp->count == 0) { @@ -1458,7 +1467,7 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) static void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } @@ -1788,27 +1797,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) struct nfs_page *req; int ret = 0; - for (;;) { - wait_on_page_writeback(page); - req = nfs_page_find_request(page); - if (req == NULL) - break; - if (nfs_lock_request(req)) { - nfs_clear_request_commit(req); - nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); - nfs_unlock_and_release_request(req); - break; - } - ret = nfs_wait_on_request(req); - nfs_release_request(req); - if (ret < 0) - break; + wait_on_page_writeback(page); + + /* blocking call to cancel all requests and join to a single (head) + * request */ + req = nfs_lock_and_join_requests(page, false); + + if (IS_ERR(req)) { + ret = PTR_ERR(req); + } else if (req) { + /* all requests from this page have been cancelled by + * nfs_lock_and_join_requests, so just remove the head + * request from the inode / page_private pointer and + * release it */ + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_and_release_request(req); } + return ret; } @@ -1874,7 +1884,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", - sizeof(struct nfs_write_header), + sizeof(struct nfs_rw_header), 0, SLAB_HWCACHE_ALIGN, NULL); if (nfs_wdata_cachep == NULL) @@ -1936,3 +1946,12 @@ void nfs_destroy_writepagecache(void) kmem_cache_destroy(nfs_wdata_cachep); } +static const struct nfs_rw_ops nfs_rw_write_ops = { + .rw_mode = FMODE_WRITE, + .rw_alloc_header = nfs_writehdr_alloc, + .rw_free_header = nfs_writehdr_free, + .rw_release = nfs_writeback_release_common, + .rw_done = nfs_writeback_done, + .rw_result = nfs_writeback_result, + .rw_initiate = nfs_initiate_write, +}; diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index b481e1f5eecc..a986ceb6fd0d 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -49,7 +49,7 @@ struct svc_rqst; struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); -__be32 nfs4_acl_write_who(int who, __be32 **p, int *len); +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 2645be435e75..72f44823adbb 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -1,7 +1,6 @@ /* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/sched.h> -#include <linux/user_namespace.h> #include "nfsd.h" #include "auth.h" @@ -25,7 +24,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) struct cred *new; int i; int flags = nfsexp_flags(rqstp, exp); - int ret; validate_process_creds(); @@ -86,8 +84,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) return 0; oom: - ret = -ENOMEM; abort_creds(new); - return ret; + return -ENOMEM; } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8513c598fabf..13b85f94d9e2 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -17,17 +17,12 @@ #include <linux/exportfs.h> #include <linux/sunrpc/svc_xprt.h> -#include <net/ipv6.h> - #include "nfsd.h" #include "nfsfh.h" #include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT -typedef struct auth_domain svc_client; -typedef struct svc_export svc_export; - /* * We have two caches. * One maps client+vfsmnt+dentry to export options - the export map @@ -73,7 +68,7 @@ static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { - /* client fsidtype fsid [path] */ + /* client fsidtype fsid expiry [path] */ char *buf; int len; struct auth_domain *dom = NULL; @@ -295,13 +290,19 @@ svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) { + struct nfsd4_fs_location *locations = fsloc->locations; int i; + if (!locations) + return; + for (i = 0; i < fsloc->locations_count; i++) { - kfree(fsloc->locations[i].path); - kfree(fsloc->locations[i].hosts); + kfree(locations[i].path); + kfree(locations[i].hosts); } - kfree(fsloc->locations); + + kfree(locations); + fsloc->locations = NULL; } static void svc_export_put(struct kref *ref) @@ -388,6 +389,10 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) int len; int migrated, i, err; + /* more than one fsloc */ + if (fsloc->locations) + return -EINVAL; + /* listsize */ err = get_uint(mesg, &fsloc->locations_count); if (err) @@ -437,13 +442,18 @@ out_free_all: static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { - int listsize, err; struct exp_flavor_info *f; + u32 listsize; + int err; + + /* more than one secinfo */ + if (exp->ex_nflavors) + return -EINVAL; - err = get_int(mesg, &listsize); + err = get_uint(mesg, &listsize); if (err) return err; - if (listsize < 0 || listsize > MAX_SECINFO_LIST) + if (listsize > MAX_SECINFO_LIST) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { @@ -474,6 +484,27 @@ static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif +static inline int +uuid_parse(char **mesg, char *buf, unsigned char **puuid) +{ + int len; + + /* more than one uuid */ + if (*puuid) + return -EINVAL; + + /* expect a 16 byte uuid encoded as \xXXXX... */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len != EX_UUID_LEN) + return -EINVAL; + + *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); + if (*puuid == NULL) + return -ENOMEM; + + return 0; +} + static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ @@ -552,18 +583,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); - else if (strcmp(buf, "uuid") == 0) { - /* expect a 16 byte uuid encoded as \xXXXX... */ - len = qword_get(&mesg, buf, PAGE_SIZE); - if (len != 16) - err = -EINVAL; - else { - exp.ex_uuid = - kmemdup(buf, 16, GFP_KERNEL); - if (exp.ex_uuid == NULL) - err = -ENOMEM; - } - } else if (strcmp(buf, "secinfo") == 0) + else if (strcmp(buf, "uuid") == 0) + err = uuid_parse(&mesg, buf, &exp.ex_uuid); + else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything @@ -649,7 +671,7 @@ static int svc_export_show(struct seq_file *m, if (exp->ex_uuid) { int i; seq_puts(m, ",uuid="); - for (i=0; i<16; i++) { + for (i = 0; i < EX_UUID_LEN; i++) { if ((i&3) == 0 && i) seq_putc(m, ':'); seq_printf(m, "%02x", exp->ex_uuid[i]); @@ -771,7 +793,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old) static struct svc_expkey * -exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type, +exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; @@ -793,9 +815,9 @@ exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type, return ek; } - -static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp, - const struct path *path, struct cache_req *reqp) +static struct svc_export * +exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, + const struct path *path, struct cache_req *reqp) { struct svc_export *exp, key; int err; @@ -819,11 +841,11 @@ static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp, /* * Find the export entry for a given dentry. */ -static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp, - struct path *path) +static struct svc_export * +exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) { struct dentry *saved = dget(path->dentry); - svc_export *exp = exp_get_by_name(cd, clp, path, NULL); + struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); @@ -844,7 +866,7 @@ static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp, * since its harder to fool a kernel module than a user space program. */ int -exp_rootfh(struct net *net, svc_client *clp, char *name, +exp_rootfh(struct net *net, struct auth_domain *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h new file mode 100644 index 000000000000..cfeea85c5bed --- /dev/null +++ b/fs/nfsd/export.h @@ -0,0 +1,110 @@ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + */ +#ifndef NFSD_EXPORT_H +#define NFSD_EXPORT_H + +#include <linux/sunrpc/cache.h> +#include <uapi/linux/nfsd/export.h> + +struct knfsd_fh; +struct svc_fh; +struct svc_rqst; + +/* + * FS Locations + */ + +#define MAX_FS_LOCATIONS 128 + +struct nfsd4_fs_location { + char *hosts; /* colon separated list of hosts */ + char *path; /* slash separated list of path components */ +}; + +struct nfsd4_fs_locations { + uint32_t locations_count; + struct nfsd4_fs_location *locations; +/* If we're not actually serving this data ourselves (only providing a + * list of replicas that do serve it) then we set "migrated": */ + int migrated; +}; + +/* + * We keep an array of pseudoflavors with the export, in order from most + * to least preferred. For the foreseeable future, we don't expect more + * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3, + * spkm3i, and spkm3p (and using all 8 at once should be rare). + */ +#define MAX_SECINFO_LIST 8 +#define EX_UUID_LEN 16 + +struct exp_flavor_info { + u32 pseudoflavor; + u32 flags; +}; + +struct svc_export { + struct cache_head h; + struct auth_domain * ex_client; + int ex_flags; + struct path ex_path; + kuid_t ex_anon_uid; + kgid_t ex_anon_gid; + int ex_fsid; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + uint32_t ex_nflavors; + struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + struct cache_detail *cd; +}; + +/* an "export key" (expkey) maps a filehandlefragement to an + * svc_export for a given client. There can be several per export, + * for the different fsid types. + */ +struct svc_expkey { + struct cache_head h; + + struct auth_domain * ek_client; + int ek_fsidtype; + u32 ek_fsid[6]; + + struct path ek_path; +}; + +#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC)) +#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE) +#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES) + +int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp); +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); + +/* + * Function declarations + */ +int nfsd_export_init(struct net *); +void nfsd_export_shutdown(struct net *); +void nfsd_export_flush(struct net *); +struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, + struct path *); +struct svc_export * rqst_exp_parent(struct svc_rqst *, + struct path *); +struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); +int exp_rootfh(struct net *, struct auth_domain *, + char *path, struct knfsd_fh *, int maxsize); +__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *); +__be32 nfserrno(int errno); + +static inline void exp_put(struct svc_export *exp) +{ + cache_put(&exp->h, exp->cd); +} + +static inline void exp_get(struct svc_export *exp) +{ + cache_get(&exp->h); +} +struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); + +#endif /* NFSD_EXPORT_H */ diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index d620e7f81429..2ed05c3cd43d 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -97,25 +97,14 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf, { static u64 val; char read_buf[25]; - size_t size, ret; + size_t size; loff_t pos = *ppos; if (!pos) nfsd_inject_get(file_inode(file)->i_private, &val); size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); - if (pos < 0) - return -EINVAL; - if (pos >= size || !len) - return 0; - if (len > size - pos) - len = size - pos; - ret = copy_to_user(buf, read_buf + pos, len); - if (ret == len) - return -EFAULT; - len -= ret; - *ppos = pos + len; - return len; + return simple_read_from_buffer(buf, len, ppos, read_buf, size); } static ssize_t fault_inject_write(struct file *file, const char __user *buf, diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h index 66e58db01936..a3f34900091f 100644 --- a/fs/nfsd/idmap.h +++ b/fs/nfsd/idmap.h @@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net) __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *); __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *); -__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *); -__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *); +__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t); +__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t); #endif /* LINUX_NFSD_IDMAP_H */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 11c1fba29312..12b023a7ab7d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -182,7 +182,8 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclargs *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->mask = ntohl(*p); p++; @@ -197,7 +198,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, unsigned int base; int n; - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->mask = ntohl(*p++); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || @@ -218,7 +220,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -226,7 +229,8 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p, static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_accessargs *argp) { - if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + p = nfs2svc_decode_fh(p, &argp->fh); + if (!p) return 0; argp->access = ntohl(*p++); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index adc5f1b1dc26..2a514e21dc74 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -128,7 +128,8 @@ out: static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclargs *args) { - if (!(p = nfs3svc_decode_fh(p, &args->fh))) + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) return 0; args->mask = ntohl(*p); p++; @@ -143,7 +144,8 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p, unsigned int base; int n; - if (!(p = nfs3svc_decode_fh(p, &args->fh))) + p = nfs3svc_decode_fh(p, &args->fh); + if (!p) return 0; args->mask = ntohl(*p++); if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index de6e39e12cb3..e6c01e80325e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -278,7 +278,8 @@ void fill_post_wcc(struct svc_fh *fhp) int nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -287,7 +288,8 @@ int nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_sattrargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = decode_sattr3(p, &args->attrs); @@ -315,7 +317,8 @@ int nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_accessargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->access = ntohl(*p++); @@ -330,7 +333,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, int v; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); @@ -360,7 +364,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); @@ -535,7 +540,8 @@ int nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readlinkargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); @@ -558,7 +564,8 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_readdirargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->cookie); args->verf = p; p += 2; @@ -580,7 +587,8 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, int len; u32 max_blocksize = svc_max_payload(rqstp); - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->cookie); args->verf = p; p += 2; @@ -605,7 +613,8 @@ int nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_commitargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); args->count = ntohl(*p++); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 6f3f392d48af..d714156a19fd 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> -#include <linux/export.h> #include "nfsfh.h" #include "nfsd.h" #include "acl.h" @@ -402,8 +401,10 @@ sort_pacl(struct posix_acl *pacl) * by uid/gid. */ int i, j; - if (pacl->a_count <= 4) - return; /* no users or groups */ + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + i = 1; while (pacl->a_entries[i].e_tag == ACL_USER) i++; @@ -530,13 +531,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) /* * ACLs with no ACEs are treated differently in the inheritable - * and effective cases: when there are no inheritable ACEs, we - * set a zero-length default posix acl: + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { - pacl = posix_acl_alloc(0, GFP_KERNEL); - return pacl ? pacl : ERR_PTR(-ENOMEM); - } + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + /* * When there are no effective ACEs, the following will end * up setting a 3-element effective posix ACL with all @@ -589,7 +589,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) add_to_mask(state, &state->groups->aces[i].perms); } - if (!state->users->n && !state->groups->n) { + if (state->users->n || state->groups->n) { pace++; pace->e_tag = ACL_MASK; low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); @@ -919,20 +919,19 @@ nfs4_acl_get_whotype(char *p, u32 len) return NFS4_ACL_WHO_NAMED; } -__be32 nfs4_acl_write_who(int who, __be32 **p, int *len) +__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who) { + __be32 *p; int i; - int bytes; for (i = 0; i < ARRAY_SIZE(s2t_map); i++) { if (s2t_map[i].type != who) continue; - bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2); - if (bytes > *len) + p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4); + if (!p) return nfserr_resource; - *p = xdr_encode_opaque(*p, s2t_map[i].string, + p = xdr_encode_opaque(p, s2t_map[i].string, s2t_map[i].stringlen); - *len -= bytes; return 0; } WARN_ON_ONCE(1); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 39c8ef875f91..2c73cae9899d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -654,9 +654,11 @@ static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { - .to_initval = max_cb_time(clp->net), + .to_initval = maxtime, .to_retries = 0, + .to_maxval = maxtime, }; struct rpc_create_args args = { .net = clp->net, diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index c0dfde68742e..a0ab0a847d69 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -551,44 +551,43 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen return 0; } -static __be32 encode_ascii_id(u32 id, __be32 **p, int *buflen) +static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) { char buf[11]; int len; - int bytes; + __be32 *p; len = sprintf(buf, "%u", id); - bytes = 4 + (XDR_QUADLEN(len) << 2); - if (bytes > *buflen) + p = xdr_reserve_space(xdr, len + 4); + if (!p) return nfserr_resource; - *p = xdr_encode_opaque(*p, buf, len); - *buflen -= bytes; + p = xdr_encode_opaque(p, buf, len); return 0; } -static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) +static __be32 idmap_id_to_name(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) { struct ent *item, key = { .id = id, .type = type, }; + __be32 *p; int ret; - int bytes; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) - return encode_ascii_id(id, p, buflen); + return encode_ascii_id(xdr, id); if (ret) return nfserrno(ret); ret = strlen(item->name); WARN_ON_ONCE(ret > IDMAP_NAMESZ); - bytes = 4 + (XDR_QUADLEN(ret) << 2); - if (bytes > *buflen) + p = xdr_reserve_space(xdr, ret + 4); + if (!p) return nfserr_resource; - *p = xdr_encode_opaque(*p, item->name, ret); - *buflen -= bytes; + p = xdr_encode_opaque(p, item->name, ret); cache_put(&item->h, nn->idtoname_cache); return 0; } @@ -622,11 +621,12 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u return idmap_name_to_id(rqstp, type, name, namelen, id); } -static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen) +static __be32 encode_name_from_id(struct xdr_stream *xdr, + struct svc_rqst *rqstp, int type, u32 id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) - return encode_ascii_id(id, p, buflen); - return idmap_id_to_name(rqstp, type, id, p, buflen); + return encode_ascii_id(xdr, id); + return idmap_id_to_name(xdr, rqstp, type, id); } __be32 @@ -655,14 +655,16 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, return status; } -__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen) +__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kuid_t uid) { u32 id = from_kuid(&init_user_ns, uid); - return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); } -__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen) +__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, + kgid_t gid) { u32 id = from_kgid(&init_user_ns, gid); - return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, buflen); + return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d543222babf3..8f029db5d271 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -430,12 +430,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion, nn); if (status) goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: status = do_open_fhandle(rqstp, cstate, open); @@ -445,7 +445,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; case NFS4_OPEN_CLAIM_DELEG_PREV_FH: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; dprintk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); status = nfserr_notsupp; @@ -618,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (create->cr_type) { case NF4LNK: - /* ugh! we have to null-terminate the linktext, or - * vfs_symlink() will choke. it is always safe to - * null-terminate by brute force, since at worst we - * will overwrite the first byte of the create namelen - * in the XDR buffer, which has already been extracted - * during XDR decode. - */ - create->cr_linkname[create->cr_linklen] = 0; - status = nfsd_symlink(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, create->cr_linkname, create->cr_linklen, @@ -786,7 +776,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfsd4_last_compound_op(rqstp)) rqstp->rq_splice_ok = false; - nfs4_lock_state(); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &read->rd_stateid, @@ -794,11 +783,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } - if (read->rd_filp) - get_file(read->rd_filp); status = nfs_ok; out: - nfs4_unlock_state(); read->rd_rqstp = rqstp; read->rd_fhp = &cstate->current_fh; return status; @@ -937,10 +923,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { - nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, &setattr->sa_stateid, WR_STATE, NULL); - nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -1006,17 +990,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; - nfs4_lock_state(); status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, stateid, WR_STATE, &filp); if (status) { - nfs4_unlock_state(); dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; } - if (filp) - get_file(filp); - nfs4_unlock_state(); cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; @@ -1072,10 +1051,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_jukebox; p = buf; - status = nfsd4_encode_fattr(&cstate->current_fh, + status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh, cstate->current_fh.fh_export, - cstate->current_fh.fh_dentry, &p, - count, verify->ve_bmval, + cstate->current_fh.fh_dentry, + verify->ve_bmval, rqstp, 0); /* * If nfsd4_encode_fattr() ran out of space, assume that's because @@ -1182,9 +1161,7 @@ struct nfsd4_operation { static struct nfsd4_operation nfsd4_ops[]; -#ifdef NFSD_DEBUG static const char *nfsd4_op_name(unsigned opnum); -#endif /* * Enforce NFSv4.1 COMPOUND ordering rules: @@ -1226,6 +1203,8 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op) bool nfsd4_cache_this_op(struct nfsd4_op *op) { + if (op->opnum == OP_ILLEGAL) + return false; return OPDESC(op)->op_flags & OP_CACHEME; } @@ -1262,6 +1241,25 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) return !(nextd->op_flags & OP_HANDLES_WRONGSEC); } +static void svcxdr_init_encode(struct svc_rqst *rqstp, + struct nfsd4_compoundres *resp) +{ + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = &rqstp->rq_res; + struct kvec *head = buf->head; + + xdr->buf = buf; + xdr->iov = head; + xdr->p = head->iov_base + head->iov_len; + xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; + /* Tail and page_len should be zero at this point: */ + buf->len = buf->head[0].iov_len; + xdr->scratch.iov_len = 0; + xdr->page_ptr = buf->pages - 1; + buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) + - rqstp->rq_auth_slack; +} + /* * COMPOUND call. */ @@ -1275,24 +1273,16 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate = &resp->cstate; struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *save_fh = &cstate->save_fh; - int slack_bytes; - u32 plen = 0; __be32 status; - resp->xbuf = &rqstp->rq_res; - resp->p = rqstp->rq_res.head[0].iov_base + - rqstp->rq_res.head[0].iov_len; - resp->tagp = resp->p; + svcxdr_init_encode(rqstp, resp); + resp->tagp = resp->xdr.p; /* reserve space for: taglen, tag, and opcnt */ - resp->p += 2 + XDR_QUADLEN(args->taglen); - resp->end = rqstp->rq_res.head[0].iov_base + PAGE_SIZE; + xdr_reserve_space(&resp->xdr, 8 + args->taglen); resp->taglen = args->taglen; resp->tag = args->tag; - resp->opcnt = 0; resp->rqstp = rqstp; cstate->minorversion = args->minorversion; - cstate->replay_owner = NULL; - cstate->session = NULL; fh_init(current_fh, NFS4_FHSIZE); fh_init(save_fh, NFS4_FHSIZE); /* @@ -1332,19 +1322,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, goto encode_op; } - /* We must be able to encode a successful response to - * this operation, with enough room left over to encode a - * failed response to the next operation. If we don't - * have enough room, fail with ERR_RESOURCE. - */ - slack_bytes = (char *)resp->end - (char *)resp->p; - if (slack_bytes < COMPOUND_SLACK_SPACE - + COMPOUND_ERR_SLACK_SPACE) { - BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE); - op->status = nfserr_resource; - goto encode_op; - } - opdesc = OPDESC(op); if (!current_fh->fh_dentry) { @@ -1362,9 +1339,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, /* If op is non-idempotent */ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { - plen = opdesc->op_rsize_bop(rqstp, op); /* - * If there's still another operation, make sure + * Don't execute this op if we couldn't encode a + * succesful reply: + */ + u32 plen = opdesc->op_rsize_bop(rqstp, op); + /* + * Plus if there's another operation, make sure * we'll have space to at least encode an error: */ if (resp->opcnt < args->opcnt) @@ -1399,7 +1380,7 @@ encode_op: } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; - nfsd4_encode_replay(resp, op); + nfsd4_encode_replay(&resp->xdr, op); status = op->status = op->replay->rp_status; } else { nfsd4_encode_operation(resp, op); @@ -1438,7 +1419,8 @@ out: #define op_encode_change_info_maxsz (5) #define nfs4_fattr_bitmap_maxsz (4) -#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) +/* We'll fall back on returning no lockowner if run out of space: */ +#define op_encode_lockowner_maxsz (0) #define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz) #define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -1470,6 +1452,49 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); } +/* + * Note since this is an idempotent operation we won't insist on failing + * the op prematurely if the estimate is too large. We may turn off splice + * reads unnecessarily. + */ +static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 *bmap = op->u.getattr.ga_bmval; + u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2]; + u32 ret = 0; + + if (bmap0 & FATTR4_WORD0_ACL) + return svc_max_payload(rqstp); + if (bmap0 & FATTR4_WORD0_FS_LOCATIONS) + return svc_max_payload(rqstp); + + if (bmap1 & FATTR4_WORD1_OWNER) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER; + } + if (bmap1 & FATTR4_WORD1_OWNER_GROUP) { + ret += IDMAP_NAMESZ + 4; + bmap1 &= ~FATTR4_WORD1_OWNER_GROUP; + } + if (bmap0 & FATTR4_WORD0_FILEHANDLE) { + ret += NFS4_FHSIZE + 4; + bmap0 &= ~FATTR4_WORD0_FILEHANDLE; + } + if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { + ret += NFSD4_MAX_SEC_LABEL_LEN + 12; + bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; + } + /* + * Largest of remaining attributes are 16 bytes (e.g., + * supported_attributes) + */ + ret += 16 * (hweight32(bmap0) + hweight32(bmap1) + hweight32(bmap2)); + /* bitmask, length */ + ret += 20; + return ret; +} + static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + op_encode_change_info_maxsz) @@ -1500,18 +1525,19 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) if (rlen > maxcount) rlen = maxcount; - return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen; + return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { + u32 maxcount = svc_max_payload(rqstp); u32 rlen = op->u.readdir.rd_maxcount; - if (rlen > PAGE_SIZE) - rlen = PAGE_SIZE; + if (rlen > maxcount) + rlen = maxcount; - return (op_encode_hdr_size + op_encode_verifier_maxsz) - * sizeof(__be32) + rlen; + return (op_encode_hdr_size + op_encode_verifier_maxsz + + XDR_QUADLEN(rlen)) * sizeof(__be32); } static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) @@ -1526,6 +1552,12 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op + op_encode_change_info_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return NFS4_MAX_SESSIONID_LEN + 20; +} + static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32); @@ -1539,7 +1571,7 @@ static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_o static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32); + return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32); } static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) @@ -1607,6 +1639,7 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_GETATTR] = { .op_func = (nfsd4op_func)nfsd4_getattr, .op_flags = ALLOWED_ON_ABSENT_FS, + .op_rsize_bop = nfsd4_getattr_rsize, .op_name = "OP_GETATTR", }, [OP_GETFH] = { @@ -1676,37 +1709,32 @@ static struct nfsd4_operation nfsd4_ops[] = { [OP_PUTFH] = { .op_func = (nfsd4op_func)nfsd4_putfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTPUBFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTPUBFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_PUTROOTFH] = { .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING - | OP_CLEAR_STATEID, + | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID, .op_name = "OP_PUTROOTFH", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_READ] = { .op_func = (nfsd4op_func)nfsd4_read, - .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READ", .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize, .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid, }, [OP_READDIR] = { .op_func = (nfsd4op_func)nfsd4_readdir, - .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_READDIR", .op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize, }, @@ -1864,14 +1892,33 @@ static struct nfsd4_operation nfsd4_ops[] = { }, }; -#ifdef NFSD_DEBUG +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + struct nfsd4_operation *opdesc; + nfsd4op_rsize estimator; + + if (op->opnum == OP_ILLEGAL) + return op_encode_hdr_size * sizeof(__be32); + opdesc = OPDESC(op); + estimator = opdesc->op_rsize_bop; + return estimator ? estimator(rqstp, op) : PAGE_SIZE; +} + +void warn_on_nonidempotent_op(struct nfsd4_op *op) +{ + if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { + pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + op->opnum, nfsd4_op_name(op->opnum)); + WARN_ON_ONCE(1); + } +} + static const char *nfsd4_op_name(unsigned opnum) { if (opnum < ARRAY_SIZE(nfsd4_ops)) return nfsd4_ops[opnum].op_name; return "unknown_operation"; } -#endif #define nfsd4_voidres nfsd4_voidargs struct nfsd4_voidargs { int dummy; }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3ba65979a3cd..2204e1fe5725 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -41,6 +41,7 @@ #include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> +#include <linux/hash.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -81,13 +82,13 @@ static DEFINE_MUTEX(client_mutex); * effort to decrease the scope of the client_mutex, this spinlock may * eventually cover more: */ -static DEFINE_SPINLOCK(recall_lock); +static DEFINE_SPINLOCK(state_lock); -static struct kmem_cache *openowner_slab = NULL; -static struct kmem_cache *lockowner_slab = NULL; -static struct kmem_cache *file_slab = NULL; -static struct kmem_cache *stateid_slab = NULL; -static struct kmem_cache *deleg_slab = NULL; +static struct kmem_cache *openowner_slab; +static struct kmem_cache *lockowner_slab; +static struct kmem_cache *file_slab; +static struct kmem_cache *stateid_slab; +static struct kmem_cache *deleg_slab; void nfs4_lock_state(void) @@ -235,9 +236,9 @@ static void nfsd4_free_file(struct nfs4_file *f) static inline void put_nfs4_file(struct nfs4_file *fi) { - if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del(&fi->fi_hash); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); iput(fi->fi_inode); nfsd4_free_file(fi); } @@ -364,6 +365,79 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); } +/* + * When we recall a delegation, we should be careful not to hand it + * out again straight away. + * To ensure this we keep a pair of bloom filters ('new' and 'old') + * in which the filehandles of recalled delegations are "stored". + * If a filehandle appear in either filter, a delegation is blocked. + * When a delegation is recalled, the filehandle is stored in the "new" + * filter. + * Every 30 seconds we swap the filters and clear the "new" one, + * unless both are empty of course. + * + * Each filter is 256 bits. We hash the filehandle to 32bit and use the + * low 3 bytes as hash-table indices. + * + * 'state_lock', which is always held when block_delegations() is called, + * is used to manage concurrent access. Testing does not need the lock + * except when swapping the two filters. + */ +static struct bloom_pair { + int entries, old_entries; + time_t swap_time; + int new; /* index into 'set' */ + DECLARE_BITMAP(set[2], 256); +} blocked_delegations; + +static int delegation_blocked(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + if (bd->entries == 0) + return 0; + if (seconds_since_boot() - bd->swap_time > 30) { + spin_lock(&state_lock); + if (seconds_since_boot() - bd->swap_time > 30) { + bd->entries -= bd->old_entries; + bd->old_entries = bd->entries; + memset(bd->set[bd->new], 0, + sizeof(bd->set[0])); + bd->new = 1-bd->new; + bd->swap_time = seconds_since_boot(); + } + spin_unlock(&state_lock); + } + hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); + if (test_bit(hash&255, bd->set[0]) && + test_bit((hash>>8)&255, bd->set[0]) && + test_bit((hash>>16)&255, bd->set[0])) + return 1; + + if (test_bit(hash&255, bd->set[1]) && + test_bit((hash>>8)&255, bd->set[1]) && + test_bit((hash>>16)&255, bd->set[1])) + return 1; + + return 0; +} + +static void block_delegations(struct knfsd_fh *fh) +{ + u32 hash; + struct bloom_pair *bd = &blocked_delegations; + + hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); + + __set_bit(hash&255, bd->set[bd->new]); + __set_bit((hash>>8)&255, bd->set[bd->new]); + __set_bit((hash>>16)&255, bd->set[bd->new]); + if (bd->entries == 0) + bd->swap_time = seconds_since_boot(); + bd->entries += 1; +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) { @@ -372,10 +446,11 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv dprintk("NFSD alloc_init_deleg\n"); if (num_delegations > max_delegations) return NULL; + if (delegation_blocked(¤t_fh->fh_handle)) + return NULL; dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); if (dp == NULL) return dp; - dp->dl_stid.sc_type = NFS4_DELEG_STID; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -418,6 +493,8 @@ nfs4_put_delegation(struct nfs4_delegation *dp) static void nfs4_put_deleg_lease(struct nfs4_file *fp) { + if (!fp->fi_lease) + return; if (atomic_dec_and_test(&fp->fi_delegees)) { vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); fp->fi_lease = NULL; @@ -431,18 +508,30 @@ static void unhash_stid(struct nfs4_stid *s) s->sc_type = 0; } +static void +hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) +{ + lockdep_assert_held(&state_lock); + + dp->dl_stid.sc_type = NFS4_DELEG_STID; + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); +} + /* Called under the state lock. */ static void unhash_delegation(struct nfs4_delegation *dp) { + spin_lock(&state_lock); list_del_init(&dp->dl_perclnt); - spin_lock(&recall_lock); list_del_init(&dp->dl_perfile); list_del_init(&dp->dl_recall_lru); - spin_unlock(&recall_lock); - nfs4_put_deleg_lease(dp->dl_file); - put_nfs4_file(dp->dl_file); - dp->dl_file = NULL; + spin_unlock(&state_lock); + if (dp->dl_file) { + nfs4_put_deleg_lease(dp->dl_file); + put_nfs4_file(dp->dl_file); + dp->dl_file = NULL; + } } @@ -645,6 +734,12 @@ static void unhash_lockowner(struct nfs4_lockowner *lo) } } +static void nfs4_free_lockowner(struct nfs4_lockowner *lo) +{ + kfree(lo->lo_owner.so_owner.data); + kmem_cache_free(lockowner_slab, lo); +} + static void release_lockowner(struct nfs4_lockowner *lo) { unhash_lockowner(lo); @@ -699,6 +794,12 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo) } } +static void nfs4_free_openowner(struct nfs4_openowner *oo) +{ + kfree(oo->oo_owner.so_owner.data); + kmem_cache_free(openowner_slab, oo); +} + static void release_openowner(struct nfs4_openowner *oo) { unhash_openowner(oo); @@ -1078,10 +1179,22 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return NULL; } clp->cl_name.len = name.len; + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; } -static inline void +static void free_client(struct nfs4_client *clp) { struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id); @@ -1095,6 +1208,7 @@ free_client(struct nfs4_client *clp) WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); @@ -1123,13 +1237,13 @@ destroy_client(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); INIT_LIST_HEAD(&reaplist); - spin_lock(&recall_lock); + spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); list_del_init(&dp->dl_perclnt); list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); destroy_delegation(dp); @@ -1347,7 +1461,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, if (clp == NULL) return NULL; - INIT_LIST_HEAD(&clp->cl_sessions); ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { spin_lock(&nn->client_lock); @@ -1355,20 +1468,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, spin_unlock(&nn->client_lock); return NULL; } - idr_init(&clp->cl_stateids); - atomic_set(&clp->cl_refcount, 0); - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); - INIT_LIST_HEAD(&clp->cl_revoked); - spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); - rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); gen_confirm(clp); @@ -1543,6 +1645,7 @@ out_err: void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { + struct xdr_buf *buf = resp->xdr.buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; @@ -1556,11 +1659,9 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) slot->sl_datalen = 0; return; } - slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap; - base = (char *)resp->cstate.datap - - (char *)resp->xbuf->head[0].iov_base; - if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data, - slot->sl_datalen)) + base = resp->cstate.data_offset; + slot->sl_datalen = buf->len - base; + if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) WARN("%s: sessions DRC could not cache compound\n", __func__); return; } @@ -1601,6 +1702,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { struct nfsd4_slot *slot = resp->cstate.slot; + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; __be32 status; dprintk("--> %s slot %p\n", __func__, slot); @@ -1609,14 +1712,16 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, if (status) return status; - /* The sequence operation has been encoded, cstate->datap set. */ - memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen); + p = xdr_reserve_space(xdr, slot->sl_datalen); + if (!p) { + WARN_ON_ONCE(1); + return nfserr_serverfault; + } + xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen); + xdr_commit_encode(xdr); resp->opcnt = slot->sl_opcnt; - resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen); - status = slot->sl_status; - - return status; + return slot->sl_status; } /* @@ -2188,11 +2293,13 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_sequence *seq) { struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; struct nfsd4_conn *conn; __be32 status; + int buflen; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (resp->opcnt != 1) @@ -2261,6 +2368,16 @@ nfsd4_sequence(struct svc_rqst *rqstp, if (status) goto out_put_session; + buflen = (seq->cachethis) ? + session->se_fchannel.maxresp_cached : + session->se_fchannel.maxresp_sz; + status = (seq->cachethis) ? nfserr_rep_too_big_to_cache : + nfserr_rep_too_big; + if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) + goto out_put_session; + svc_reserve(rqstp, buflen); + + status = nfs_ok; /* Success! bump slot seqid */ slot->sl_seqid = seq->seqid; slot->sl_flags |= NFSD4_SLOT_INUSE; @@ -2498,28 +2615,19 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) fp->fi_lease = NULL; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); - spin_lock(&recall_lock); + spin_lock(&state_lock); hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); - spin_unlock(&recall_lock); -} - -static void -nfsd4_free_slab(struct kmem_cache **slab) -{ - if (*slab == NULL) - return; - kmem_cache_destroy(*slab); - *slab = NULL; + spin_unlock(&state_lock); } void nfsd4_free_slabs(void) { - nfsd4_free_slab(&openowner_slab); - nfsd4_free_slab(&lockowner_slab); - nfsd4_free_slab(&file_slab); - nfsd4_free_slab(&stateid_slab); - nfsd4_free_slab(&deleg_slab); + kmem_cache_destroy(openowner_slab); + kmem_cache_destroy(lockowner_slab); + kmem_cache_destroy(file_slab); + kmem_cache_destroy(stateid_slab); + kmem_cache_destroy(deleg_slab); } int @@ -2528,42 +2636,38 @@ nfsd4_init_slabs(void) openowner_slab = kmem_cache_create("nfsd4_openowners", sizeof(struct nfs4_openowner), 0, 0, NULL); if (openowner_slab == NULL) - goto out_nomem; + goto out; lockowner_slab = kmem_cache_create("nfsd4_lockowners", sizeof(struct nfs4_lockowner), 0, 0, NULL); if (lockowner_slab == NULL) - goto out_nomem; + goto out_free_openowner_slab; file_slab = kmem_cache_create("nfsd4_files", sizeof(struct nfs4_file), 0, 0, NULL); if (file_slab == NULL) - goto out_nomem; + goto out_free_lockowner_slab; stateid_slab = kmem_cache_create("nfsd4_stateids", sizeof(struct nfs4_ol_stateid), 0, 0, NULL); if (stateid_slab == NULL) - goto out_nomem; + goto out_free_file_slab; deleg_slab = kmem_cache_create("nfsd4_delegations", sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) - goto out_nomem; + goto out_free_stateid_slab; return 0; -out_nomem: - nfsd4_free_slabs(); + +out_free_stateid_slab: + kmem_cache_destroy(stateid_slab); +out_free_file_slab: + kmem_cache_destroy(file_slab); +out_free_lockowner_slab: + kmem_cache_destroy(lockowner_slab); +out_free_openowner_slab: + kmem_cache_destroy(openowner_slab); +out: dprintk("nfsd4: out of memory while initializing nfsv4\n"); return -ENOMEM; } -void nfs4_free_openowner(struct nfs4_openowner *oo) -{ - kfree(oo->oo_owner.so_owner.data); - kmem_cache_free(openowner_slab, oo); -} - -void nfs4_free_lockowner(struct nfs4_lockowner *lo) -{ - kfree(lo->lo_owner.so_owner.data); - kmem_cache_free(lockowner_slab, lo); -} - static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; @@ -2684,15 +2788,15 @@ find_file(struct inode *ino) unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; - spin_lock(&recall_lock); + spin_lock(&state_lock); hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return fp; } } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return NULL; } @@ -2729,6 +2833,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) struct nfs4_client *clp = dp->dl_stid.sc_client; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + lockdep_assert_held(&state_lock); /* We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease * callback (and since the lease code is serialized by the kernel @@ -2741,6 +2846,8 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) /* Only place dl_time is set; protected by i_lock: */ dp->dl_time = get_seconds(); + block_delegations(&dp->dl_fh); + nfsd4_cb_recall(dp); } @@ -2765,11 +2872,11 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) */ fl->fl_break_time = 0; - spin_lock(&recall_lock); + spin_lock(&state_lock); fp->fi_had_conflict = true; list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) nfsd_break_one_deleg(dp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); } static @@ -3046,11 +3153,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp) status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); if (status) goto out_free; - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); fp->fi_lease = fl; fp->fi_deleg_file = get_file(fl->fl_file); atomic_set(&fp->fi_delegees, 1); - list_add(&dp->dl_perfile, &fp->fi_delegations); + spin_lock(&state_lock); + hash_delegation_locked(dp, fp); + spin_unlock(&state_lock); return 0; out_free: locks_free_lock(fl); @@ -3059,33 +3167,21 @@ out_free: static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) { - int status; - if (fp->fi_had_conflict) return -EAGAIN; get_nfs4_file(fp); dp->dl_file = fp; - if (!fp->fi_lease) { - status = nfs4_setlease(dp); - if (status) - goto out_free; - return 0; - } - spin_lock(&recall_lock); + if (!fp->fi_lease) + return nfs4_setlease(dp); + spin_lock(&state_lock); + atomic_inc(&fp->fi_delegees); if (fp->fi_had_conflict) { - spin_unlock(&recall_lock); - status = -EAGAIN; - goto out_free; + spin_unlock(&state_lock); + return -EAGAIN; } - atomic_inc(&fp->fi_delegees); - list_add(&dp->dl_perfile, &fp->fi_delegations); - spin_unlock(&recall_lock); - list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); + hash_delegation_locked(dp, fp); + spin_unlock(&state_lock); return 0; -out_free: - put_nfs4_file(fp); - dp->dl_file = fp; - return status; } static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) @@ -3172,8 +3268,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh, open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; return; out_free: - remove_stid(&dp->dl_stid); - nfs4_put_delegation(dp); + destroy_delegation(dp); out_no_deleg: open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && @@ -3390,8 +3485,7 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_delegation *dp; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; - time_t t, clientid_val = nn->nfsd4_lease; - time_t u, test_val = nn->nfsd4_lease; + time_t t, new_timeo = nn->nfsd4_lease; nfs4_lock_state(); @@ -3403,8 +3497,7 @@ nfs4_laundromat(struct nfsd_net *nn) clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { t = clp->cl_time - cutoff; - if (clientid_val > t) - clientid_val = t; + new_timeo = min(new_timeo, t); break; } if (mark_client_expired_locked(clp)) { @@ -3421,39 +3514,35 @@ nfs4_laundromat(struct nfsd_net *nn) clp->cl_clientid.cl_id); expire_client(clp); } - spin_lock(&recall_lock); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn) continue; if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { - u = dp->dl_time - cutoff; - if (test_val > u) - test_val = u; + t = dp->dl_time - cutoff; + new_timeo = min(new_timeo, t); break; } list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); revoke_delegation(dp); } - test_val = nn->nfsd4_lease; list_for_each_safe(pos, next, &nn->close_lru) { oo = container_of(pos, struct nfs4_openowner, oo_close_lru); if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { - u = oo->oo_time - cutoff; - if (test_val > u) - test_val = u; + t = oo->oo_time - cutoff; + new_timeo = min(new_timeo, t); break; } release_openowner(oo); } - if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) - clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); nfs4_unlock_state(); - return clientid_val; + return new_timeo; } static struct workqueue_struct *laundry_wq; @@ -3653,6 +3742,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct file *file = NULL; __be32 status; if (filpp) @@ -3664,10 +3754,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(net, current_fh, stateid, flags); + nfs4_lock_state(); + status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion, nn); if (status) - return status; + goto out; status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); if (status) goto out; @@ -3678,8 +3770,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, if (status) goto out; if (filpp) { - *filpp = dp->dl_file->fi_deleg_file; - if (!*filpp) { + file = dp->dl_file->fi_deleg_file; + if (!file) { WARN_ON_ONCE(1); status = nfserr_serverfault; goto out; @@ -3700,25 +3792,36 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, goto out; if (filpp) { if (flags & RD_STATE) - *filpp = find_readable_file(stp->st_file); + file = find_readable_file(stp->st_file); else - *filpp = find_writeable_file(stp->st_file); + file = find_writeable_file(stp->st_file); } break; default: - return nfserr_bad_stateid; + status = nfserr_bad_stateid; + goto out; } status = nfs_ok; + if (file) + *filpp = get_file(file); out: + nfs4_unlock_state(); return status; } static __be32 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { - if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + + if (check_for_locks(stp->st_file, lo)) return nfserr_locks_held; - release_lock_stateid(stp); + /* + * Currently there's a 1-1 lock stateid<->lockowner + * correspondance, and we have to delete the lockowner when we + * delete the lock stateid: + */ + release_lockowner(lo); return nfs_ok; } @@ -4158,6 +4261,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c if (!same_owner_str(&lo->lo_owner, owner, clid)) return false; + if (list_empty(&lo->lo_owner.so_stateids)) { + WARN_ON_ONCE(1); + return false; + } lst = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); return lst->st_file->fi_inode == inode; @@ -4884,6 +4991,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, struct nfs4_delegation *dp, *next; u64 count = 0; + lockdep_assert_held(&state_lock); list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) { if (victims) list_move(&dp->dl_recall_lru, victims); @@ -4899,9 +5007,9 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) LIST_HEAD(victims); u64 count; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, &victims); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) revoke_delegation(dp); @@ -4915,11 +5023,11 @@ u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) LIST_HEAD(victims); u64 count; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, &victims); list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) nfsd_break_one_deleg(dp); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); return count; } @@ -4928,9 +5036,9 @@ u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) { u64 count = 0; - spin_lock(&recall_lock); + spin_lock(&state_lock); count = nfsd_find_all_delegations(clp, max, NULL); - spin_unlock(&recall_lock); + spin_unlock(&state_lock); nfsd_print_count(clp, count, "delegations"); return count; @@ -4971,13 +5079,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_ #endif /* CONFIG_NFSD_FAULT_INJECTION */ -/* initialization to perform at module load time: */ - -void -nfs4_state_init(void) -{ -} - /* * Since the lifetime of a delegation isn't limited to that of an open, a * client may quite reasonably hang on to a delegation as long as it has @@ -5148,12 +5249,12 @@ nfs4_state_shutdown_net(struct net *net) nfs4_lock_state(); INIT_LIST_HEAD(&reaplist); - spin_lock(&recall_lock); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_move(&dp->dl_recall_lru, &reaplist); } - spin_unlock(&recall_lock); + spin_unlock(&state_lock); list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); destroy_delegation(dp); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2723c1badd01..944275c8f56d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -98,11 +98,6 @@ xdr_error: \ status = nfserr_bad_xdr; \ goto out -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) #define READMEM(x,nbytes) do { \ x = (char *)p; \ p += XDR_QUADLEN(nbytes); \ @@ -248,17 +243,17 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) bmval[2] = 0; READ_BUF(4); - READ32(bmlen); + bmlen = be32_to_cpup(p++); if (bmlen > 1000) goto xdr_error; READ_BUF(bmlen << 2); if (bmlen > 0) - READ32(bmval[0]); + bmval[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bmval[1]); + bmval[1] = be32_to_cpup(p++); if (bmlen > 2) - READ32(bmval[2]); + bmval[2] = be32_to_cpup(p++); DECODE_TAIL; } @@ -270,6 +265,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, { int expected_len, len = 0; u32 dummy32; + u64 sec; char *buf; DECODE_HEAD; @@ -278,12 +274,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return status; READ_BUF(4); - READ32(expected_len); + expected_len = be32_to_cpup(p++); if (bmval[0] & FATTR4_WORD0_SIZE) { READ_BUF(8); len += 8; - READ64(iattr->ia_size); + p = xdr_decode_hyper(p, &iattr->ia_size); iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { @@ -291,7 +287,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct nfs4_ace *ace; READ_BUF(4); len += 4; - READ32(nace); + nace = be32_to_cpup(p++); if (nace > NFS4_ACL_MAX) return nfserr_fbig; @@ -305,10 +301,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, (*acl)->naces = nace; for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { READ_BUF(16); len += 16; - READ32(ace->type); - READ32(ace->flag); - READ32(ace->access_mask); - READ32(dummy32); + ace->type = be32_to_cpup(p++); + ace->flag = be32_to_cpup(p++); + ace->access_mask = be32_to_cpup(p++); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += XDR_QUADLEN(dummy32) << 2; READMEM(buf, dummy32); @@ -330,14 +326,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_MODE) { READ_BUF(4); len += 4; - READ32(iattr->ia_mode); + iattr->ia_mode = be32_to_cpup(p++); iattr->ia_mode &= (S_IFMT | S_IALLUGO); iattr->ia_valid |= ATTR_MODE; } if (bmval[1] & FATTR4_WORD1_OWNER) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -348,7 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -359,15 +355,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: /* We require the high 32 bits of 'seconds' to be 0, and we ignore all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ64(iattr->ia_atime.tv_sec); - READ32(iattr->ia_atime.tv_nsec); + p = xdr_decode_hyper(p, &sec); + iattr->ia_atime.tv_sec = (time_t)sec; + iattr->ia_atime.tv_nsec = be32_to_cpup(p++); if (iattr->ia_atime.tv_nsec >= (u32)1000000000) return nfserr_inval; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -382,15 +379,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: /* We require the high 32 bits of 'seconds' to be 0, and we ignore all 32 bits of 'nseconds'. */ READ_BUF(12); len += 12; - READ64(iattr->ia_mtime.tv_sec); - READ32(iattr->ia_mtime.tv_nsec); + p = xdr_decode_hyper(p, &sec); + iattr->ia_mtime.tv_sec = sec; + iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) return nfserr_inval; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); @@ -408,13 +406,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { READ_BUF(4); len += 4; - READ32(dummy32); /* lfs: we don't use it */ + dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ READ_BUF(4); len += 4; - READ32(dummy32); /* pi: we don't use it either */ + dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ READ_BUF(4); len += 4; - READ32(dummy32); + dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) return nfserr_badlabel; @@ -445,7 +443,7 @@ nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) DECODE_HEAD; READ_BUF(sizeof(stateid_t)); - READ32(sid->si_generation); + sid->si_generation = be32_to_cpup(p++); COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); DECODE_TAIL; @@ -457,7 +455,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access DECODE_HEAD; READ_BUF(4); - READ32(access->ac_req_access); + access->ac_req_access = be32_to_cpup(p++); DECODE_TAIL; } @@ -472,7 +470,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ /* callback_sec_params4 */ READ_BUF(4); - READ32(nr_secflavs); + nr_secflavs = be32_to_cpup(p++); if (nr_secflavs) cbs->flavor = (u32)(-1); else @@ -480,7 +478,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ cbs->flavor = 0; for (i = 0; i < nr_secflavs; ++i) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); switch (dummy) { case RPC_AUTH_NULL: /* Nothing to read */ @@ -490,21 +488,21 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ case RPC_AUTH_UNIX: READ_BUF(8); /* stamp */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* machine name */ - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); SAVEMEM(machine_name, dummy); /* uid, gid */ READ_BUF(8); - READ32(uid); - READ32(gid); + uid = be32_to_cpup(p++); + gid = be32_to_cpup(p++); /* more gids */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); if (cbs->flavor == (u32)(-1)) { kuid_t kuid = make_kuid(&init_user_ns, uid); @@ -524,14 +522,14 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_ "not supported!\n"); READ_BUF(8); /* gcbp_service */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* gcbp_handle_from_server */ - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); /* gcbp_handle_from_client */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); break; default: @@ -547,7 +545,7 @@ static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, stru DECODE_HEAD; READ_BUF(4); - READ32(bc->bc_cb_program); + bc->bc_cb_program = be32_to_cpup(p++); nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); DECODE_TAIL; @@ -559,7 +557,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - READ32(bcts->dir); + bcts->dir = be32_to_cpup(p++); /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker * could help us figure out we should be using it. */ DECODE_TAIL; @@ -571,7 +569,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; READ_BUF(4); - READ32(close->cl_seqid); + close->cl_seqid = be32_to_cpup(p++); return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; @@ -584,8 +582,8 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit DECODE_HEAD; READ_BUF(12); - READ64(commit->co_offset); - READ32(commit->co_count); + p = xdr_decode_hyper(p, &commit->co_offset); + commit->co_count = be32_to_cpup(p++); DECODE_TAIL; } @@ -596,19 +594,30 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create DECODE_HEAD; READ_BUF(4); - READ32(create->cr_type); + create->cr_type = be32_to_cpup(p++); switch (create->cr_type) { case NF4LNK: READ_BUF(4); - READ32(create->cr_linklen); + create->cr_linklen = be32_to_cpup(p++); READ_BUF(create->cr_linklen); - SAVEMEM(create->cr_linkname, create->cr_linklen); + /* + * The VFS will want a null-terminated string, and + * null-terminating in place isn't safe since this might + * end on a page boundary: + */ + create->cr_linkname = + kmalloc(create->cr_linklen + 1, GFP_KERNEL); + if (!create->cr_linkname) + return nfserr_jukebox; + memcpy(create->cr_linkname, p, create->cr_linklen); + create->cr_linkname[create->cr_linklen] = '\0'; + defer_free(argp, kfree, create->cr_linkname); break; case NF4BLK: case NF4CHR: READ_BUF(8); - READ32(create->cr_specdata1); - READ32(create->cr_specdata2); + create->cr_specdata1 = be32_to_cpup(p++); + create->cr_specdata2 = be32_to_cpup(p++); break; case NF4SOCK: case NF4FIFO: @@ -618,7 +627,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create } READ_BUF(4); - READ32(create->cr_namelen); + create->cr_namelen = be32_to_cpup(p++); READ_BUF(create->cr_namelen); SAVEMEM(create->cr_name, create->cr_namelen); if ((status = check_filename(create->cr_name, create->cr_namelen))) @@ -650,7 +659,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) DECODE_HEAD; READ_BUF(4); - READ32(link->li_namelen); + link->li_namelen = be32_to_cpup(p++); READ_BUF(link->li_namelen); SAVEMEM(link->li_name, link->li_namelen); if ((status = check_filename(link->li_name, link->li_namelen))) @@ -668,24 +677,24 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) * type, reclaim(boolean), offset, length, new_lock_owner(boolean) */ READ_BUF(28); - READ32(lock->lk_type); + lock->lk_type = be32_to_cpup(p++); if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) goto xdr_error; - READ32(lock->lk_reclaim); - READ64(lock->lk_offset); - READ64(lock->lk_length); - READ32(lock->lk_is_new); + lock->lk_reclaim = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lock->lk_offset); + p = xdr_decode_hyper(p, &lock->lk_length); + lock->lk_is_new = be32_to_cpup(p++); if (lock->lk_is_new) { READ_BUF(4); - READ32(lock->lk_new_open_seqid); + lock->lk_new_open_seqid = be32_to_cpup(p++); status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); if (status) return status; READ_BUF(8 + sizeof(clientid_t)); - READ32(lock->lk_new_lock_seqid); + lock->lk_new_lock_seqid = be32_to_cpup(p++); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); - READ32(lock->lk_new_owner.len); + lock->lk_new_owner.len = be32_to_cpup(p++); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { @@ -693,7 +702,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) if (status) return status; READ_BUF(4); - READ32(lock->lk_old_lock_seqid); + lock->lk_old_lock_seqid = be32_to_cpup(p++); } DECODE_TAIL; @@ -705,13 +714,13 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) DECODE_HEAD; READ_BUF(32); - READ32(lockt->lt_type); + lockt->lt_type = be32_to_cpup(p++); if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) goto xdr_error; - READ64(lockt->lt_offset); - READ64(lockt->lt_length); + p = xdr_decode_hyper(p, &lockt->lt_offset); + p = xdr_decode_hyper(p, &lockt->lt_length); COPYMEM(&lockt->lt_clientid, 8); - READ32(lockt->lt_owner.len); + lockt->lt_owner.len = be32_to_cpup(p++); READ_BUF(lockt->lt_owner.len); READMEM(lockt->lt_owner.data, lockt->lt_owner.len); @@ -724,16 +733,16 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; READ_BUF(8); - READ32(locku->lu_type); + locku->lu_type = be32_to_cpup(p++); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; - READ32(locku->lu_seqid); + locku->lu_seqid = be32_to_cpup(p++); status = nfsd4_decode_stateid(argp, &locku->lu_stateid); if (status) return status; READ_BUF(16); - READ64(locku->lu_offset); - READ64(locku->lu_length); + p = xdr_decode_hyper(p, &locku->lu_offset); + p = xdr_decode_hyper(p, &locku->lu_length); DECODE_TAIL; } @@ -744,7 +753,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup DECODE_HEAD; READ_BUF(4); - READ32(lookup->lo_len); + lookup->lo_len = be32_to_cpup(p++); READ_BUF(lookup->lo_len); SAVEMEM(lookup->lo_name, lookup->lo_len); if ((status = check_filename(lookup->lo_name, lookup->lo_len))) @@ -759,7 +768,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh u32 w; READ_BUF(4); - READ32(w); + w = be32_to_cpup(p++); *share_access = w & NFS4_SHARE_ACCESS_MASK; *deleg_want = w & NFS4_SHARE_WANT_MASK; if (deleg_when) @@ -811,7 +820,7 @@ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) __be32 *p; READ_BUF(4); - READ32(*x); + *x = be32_to_cpup(p++); /* Note: unlinke access bits, deny bits may be zero. */ if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; @@ -825,7 +834,7 @@ static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_ne __be32 *p; READ_BUF(4); - READ32(o->len); + o->len = be32_to_cpup(p++); if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) return nfserr_bad_xdr; @@ -850,7 +859,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) open->op_xdr_error = 0; /* seqid, share_access, share_deny, clientid, ownerlen */ READ_BUF(4); - READ32(open->op_seqid); + open->op_seqid = be32_to_cpup(p++); /* decode, yet ignore deleg_when until supported */ status = nfsd4_decode_share_access(argp, &open->op_share_access, &open->op_deleg_want, &dummy); @@ -865,13 +874,13 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) if (status) goto xdr_error; READ_BUF(4); - READ32(open->op_create); + open->op_create = be32_to_cpup(p++); switch (open->op_create) { case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: READ_BUF(4); - READ32(open->op_createmode); + open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: @@ -904,12 +913,12 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) /* open_claim */ READ_BUF(4); - READ32(open->op_claim_type); + open->op_claim_type = be32_to_cpup(p++); switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_PREV: READ_BUF(4); - READ32(open->op_fname.len); + open->op_fname.len = be32_to_cpup(p++); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); if ((status = check_filename(open->op_fname.data, open->op_fname.len))) @@ -917,14 +926,14 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) break; case NFS4_OPEN_CLAIM_PREVIOUS: READ_BUF(4); - READ32(open->op_delegate_type); + open->op_delegate_type = be32_to_cpup(p++); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); if (status) return status; READ_BUF(4); - READ32(open->op_fname.len); + open->op_fname.len = be32_to_cpup(p++); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); if ((status = check_filename(open->op_fname.data, open->op_fname.len))) @@ -962,7 +971,7 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con if (status) return status; READ_BUF(4); - READ32(open_conf->oc_seqid); + open_conf->oc_seqid = be32_to_cpup(p++); DECODE_TAIL; } @@ -976,7 +985,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d if (status) return status; READ_BUF(4); - READ32(open_down->od_seqid); + open_down->od_seqid = be32_to_cpup(p++); status = nfsd4_decode_share_access(argp, &open_down->od_share_access, &open_down->od_deleg_want, NULL); if (status) @@ -993,7 +1002,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh) DECODE_HEAD; READ_BUF(4); - READ32(putfh->pf_fhlen); + putfh->pf_fhlen = be32_to_cpup(p++); if (putfh->pf_fhlen > NFS4_FHSIZE) goto xdr_error; READ_BUF(putfh->pf_fhlen); @@ -1019,8 +1028,8 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) if (status) return status; READ_BUF(12); - READ64(read->rd_offset); - READ32(read->rd_length); + p = xdr_decode_hyper(p, &read->rd_offset); + read->rd_length = be32_to_cpup(p++); DECODE_TAIL; } @@ -1031,10 +1040,10 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read DECODE_HEAD; READ_BUF(24); - READ64(readdir->rd_cookie); + p = xdr_decode_hyper(p, &readdir->rd_cookie); COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); - READ32(readdir->rd_dircount); /* just in case you needed a useless field... */ - READ32(readdir->rd_maxcount); + readdir->rd_dircount = be32_to_cpup(p++); + readdir->rd_maxcount = be32_to_cpup(p++); if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) goto out; @@ -1047,7 +1056,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove DECODE_HEAD; READ_BUF(4); - READ32(remove->rm_namelen); + remove->rm_namelen = be32_to_cpup(p++); READ_BUF(remove->rm_namelen); SAVEMEM(remove->rm_name, remove->rm_namelen); if ((status = check_filename(remove->rm_name, remove->rm_namelen))) @@ -1062,10 +1071,10 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename DECODE_HEAD; READ_BUF(4); - READ32(rename->rn_snamelen); + rename->rn_snamelen = be32_to_cpup(p++); READ_BUF(rename->rn_snamelen + 4); SAVEMEM(rename->rn_sname, rename->rn_snamelen); - READ32(rename->rn_tnamelen); + rename->rn_tnamelen = be32_to_cpup(p++); READ_BUF(rename->rn_tnamelen); SAVEMEM(rename->rn_tname, rename->rn_tnamelen); if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) @@ -1097,7 +1106,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(4); - READ32(secinfo->si_namelen); + secinfo->si_namelen = be32_to_cpup(p++); READ_BUF(secinfo->si_namelen); SAVEMEM(secinfo->si_name, secinfo->si_namelen); status = check_filename(secinfo->si_name, secinfo->si_namelen); @@ -1113,7 +1122,7 @@ nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(4); - READ32(sin->sin_style); + sin->sin_style = be32_to_cpup(p++); DECODE_TAIL; } @@ -1144,16 +1153,16 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient if (status) return nfserr_bad_xdr; READ_BUF(8); - READ32(setclientid->se_callback_prog); - READ32(setclientid->se_callback_netid_len); + setclientid->se_callback_prog = be32_to_cpup(p++); + setclientid->se_callback_netid_len = be32_to_cpup(p++); READ_BUF(setclientid->se_callback_netid_len + 4); SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); - READ32(setclientid->se_callback_addr_len); + setclientid->se_callback_addr_len = be32_to_cpup(p++); READ_BUF(setclientid->se_callback_addr_len + 4); SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); - READ32(setclientid->se_callback_ident); + setclientid->se_callback_ident = be32_to_cpup(p++); DECODE_TAIL; } @@ -1186,7 +1195,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify * nfsd4_proc_verify */ READ_BUF(4); - READ32(verify->ve_attrlen); + verify->ve_attrlen = be32_to_cpup(p++); READ_BUF(verify->ve_attrlen); SAVEMEM(verify->ve_attrval, verify->ve_attrlen); @@ -1204,11 +1213,11 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) if (status) return status; READ_BUF(16); - READ64(write->wr_offset); - READ32(write->wr_stable_how); + p = xdr_decode_hyper(p, &write->wr_offset); + write->wr_stable_how = be32_to_cpup(p++); if (write->wr_stable_how > 2) goto xdr_error; - READ32(write->wr_buflen); + write->wr_buflen = be32_to_cpup(p++); /* Sorry .. no magic macros for this.. * * READ_BUF(write->wr_buflen); @@ -1254,7 +1263,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel READ_BUF(12); COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); - READ32(rlockowner->rl_owner.len); + rlockowner->rl_owner.len = be32_to_cpup(p++); READ_BUF(rlockowner->rl_owner.len); READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); @@ -1278,63 +1287,63 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, return nfserr_bad_xdr; READ_BUF(4); - READ32(exid->flags); + exid->flags = be32_to_cpup(p++); /* Ignore state_protect4_a */ READ_BUF(4); - READ32(exid->spa_how); + exid->spa_how = be32_to_cpup(p++); switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: /* spo_must_enforce */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; /* spo_must_allow */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; break; case SP4_SSV: /* ssp_ops */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy * 4); p += dummy; /* ssp_hash_algs<> */ READ_BUF(4); - READ32(tmp); + tmp = be32_to_cpup(p++); while (tmp--) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); } /* ssp_encr_algs<> */ READ_BUF(4); - READ32(tmp); + tmp = be32_to_cpup(p++); while (tmp--) { READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); } /* ssp_window and ssp_num_gss_handles */ READ_BUF(8); - READ32(dummy); - READ32(dummy); + dummy = be32_to_cpup(p++); + dummy = be32_to_cpup(p++); break; default: goto xdr_error; @@ -1342,7 +1351,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, /* Ignore Implementation ID */ READ_BUF(4); /* nfs_impl_id4 array length */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy > 1) goto xdr_error; @@ -1350,13 +1359,13 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, if (dummy == 1) { /* nii_domain */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); /* nii_name */ READ_BUF(4); - READ32(dummy); + dummy = be32_to_cpup(p++); READ_BUF(dummy); p += XDR_QUADLEN(dummy); @@ -1376,21 +1385,21 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(16); COPYMEM(&sess->clientid, 8); - READ32(sess->seqid); - READ32(sess->flags); + sess->seqid = be32_to_cpup(p++); + sess->flags = be32_to_cpup(p++); /* Fore channel attrs */ READ_BUF(28); - READ32(dummy); /* headerpadsz is always 0 */ - READ32(sess->fore_channel.maxreq_sz); - READ32(sess->fore_channel.maxresp_sz); - READ32(sess->fore_channel.maxresp_cached); - READ32(sess->fore_channel.maxops); - READ32(sess->fore_channel.maxreqs); - READ32(sess->fore_channel.nr_rdma_attrs); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->fore_channel.maxreq_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_sz = be32_to_cpup(p++); + sess->fore_channel.maxresp_cached = be32_to_cpup(p++); + sess->fore_channel.maxops = be32_to_cpup(p++); + sess->fore_channel.maxreqs = be32_to_cpup(p++); + sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); if (sess->fore_channel.nr_rdma_attrs == 1) { READ_BUF(4); - READ32(sess->fore_channel.rdma_attrs); + sess->fore_channel.rdma_attrs = be32_to_cpup(p++); } else if (sess->fore_channel.nr_rdma_attrs > 1) { dprintk("Too many fore channel attr bitmaps!\n"); goto xdr_error; @@ -1398,23 +1407,23 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Back channel attrs */ READ_BUF(28); - READ32(dummy); /* headerpadsz is always 0 */ - READ32(sess->back_channel.maxreq_sz); - READ32(sess->back_channel.maxresp_sz); - READ32(sess->back_channel.maxresp_cached); - READ32(sess->back_channel.maxops); - READ32(sess->back_channel.maxreqs); - READ32(sess->back_channel.nr_rdma_attrs); + dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + sess->back_channel.maxreq_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_sz = be32_to_cpup(p++); + sess->back_channel.maxresp_cached = be32_to_cpup(p++); + sess->back_channel.maxops = be32_to_cpup(p++); + sess->back_channel.maxreqs = be32_to_cpup(p++); + sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); if (sess->back_channel.nr_rdma_attrs == 1) { READ_BUF(4); - READ32(sess->back_channel.rdma_attrs); + sess->back_channel.rdma_attrs = be32_to_cpup(p++); } else if (sess->back_channel.nr_rdma_attrs > 1) { dprintk("Too many back channel attr bitmaps!\n"); goto xdr_error; } READ_BUF(4); - READ32(sess->callback_prog); + sess->callback_prog = be32_to_cpup(p++); nfsd4_decode_cb_sec(argp, &sess->cb_sec); DECODE_TAIL; } @@ -1437,7 +1446,7 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, DECODE_HEAD; READ_BUF(sizeof(stateid_t)); - READ32(free_stateid->fr_stateid.si_generation); + free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); DECODE_TAIL; @@ -1451,10 +1460,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - READ32(seq->seqid); - READ32(seq->slotid); - READ32(seq->maxslots); - READ32(seq->cachethis); + seq->seqid = be32_to_cpup(p++); + seq->slotid = be32_to_cpup(p++); + seq->maxslots = be32_to_cpup(p++); + seq->cachethis = be32_to_cpup(p++); DECODE_TAIL; } @@ -1511,7 +1520,7 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_HEAD; READ_BUF(4); - READ32(rc->rca_one_fs); + rc->rca_one_fs = be32_to_cpup(p++); DECODE_TAIL; } @@ -1605,47 +1614,25 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op) return true; } -/* - * Return a rough estimate of the maximum possible reply size. Note the - * estimate includes rpc headers so is meant to be passed to - * svc_reserve, not svc_reserve_auth. - * - * Also note the current compound encoding permits only one operation to - * use pages beyond the first one, so the maximum possible length is the - * maximum over these values, not the sum. - */ -static int nfsd4_max_reply(u32 opnum) -{ - switch (opnum) { - case OP_READLINK: - case OP_READDIR: - /* - * Both of these ops take a single page for data and put - * the head and tail in another page: - */ - return 2 * PAGE_SIZE; - case OP_READ: - return INT_MAX; - default: - return PAGE_SIZE; - } -} - static __be32 nfsd4_decode_compound(struct nfsd4_compoundargs *argp) { DECODE_HEAD; struct nfsd4_op *op; bool cachethis = false; - int max_reply = PAGE_SIZE; + int auth_slack= argp->rqstp->rq_auth_slack; + int max_reply = auth_slack + 8; /* opcnt, status */ + int readcount = 0; + int readbytes = 0; int i; READ_BUF(4); - READ32(argp->taglen); + argp->taglen = be32_to_cpup(p++); READ_BUF(argp->taglen + 8); SAVEMEM(argp->tag, argp->taglen); - READ32(argp->minorversion); - READ32(argp->opcnt); + argp->minorversion = be32_to_cpup(p++); + argp->opcnt = be32_to_cpup(p++); + max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); if (argp->taglen > NFSD4_MAX_TAGLEN) goto xdr_error; @@ -1669,7 +1656,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op->replay = NULL; READ_BUF(4); - READ32(op->opnum); + op->opnum = be32_to_cpup(p++); if (nfsd4_opnum_in_range(argp, op)) op->status = nfsd4_dec_ops[op->opnum](argp, &op->u); @@ -1677,97 +1664,82 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; } - - if (op->status) { - argp->opcnt = i+1; - break; - } /* * We'll try to cache the result in the DRC if any one * op in the compound wants to be cached: */ cachethis |= nfsd4_cache_this_op(op); - max_reply = max(max_reply, nfsd4_max_reply(op->opnum)); + if (op->opnum == OP_READ) { + readcount++; + readbytes += nfsd4_max_reply(argp->rqstp, op); + } else + max_reply += nfsd4_max_reply(argp->rqstp, op); + + if (op->status) { + argp->opcnt = i+1; + break; + } } /* Sessions make the DRC unnecessary: */ if (argp->minorversion) cachethis = false; - if (max_reply != INT_MAX) - svc_reserve(argp->rqstp, max_reply); + svc_reserve(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; - DECODE_TAIL; -} - -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((u32)((n) >> 32)); \ - *p++ = htonl((u32)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \ - *(p + XDR_QUADLEN(nbytes) -1) = 0; \ - memcpy(p, ptr, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -}} while (0) + if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) + argp->rqstp->rq_splice_ok = false; -static void write32(__be32 **p, u32 n) -{ - *(*p)++ = htonl(n); -} - -static void write64(__be32 **p, u64 n) -{ - write32(p, (n >> 32)); - write32(p, (u32)n); + DECODE_TAIL; } -static void write_change(__be32 **p, struct kstat *stat, struct inode *inode) +static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode) { if (IS_I_VERSION(inode)) { - write64(p, inode->i_version); + p = xdr_encode_hyper(p, inode->i_version); } else { - write32(p, stat->ctime.tv_sec); - write32(p, stat->ctime.tv_nsec); + *p++ = cpu_to_be32(stat->ctime.tv_sec); + *p++ = cpu_to_be32(stat->ctime.tv_nsec); } + return p; } -static void write_cinfo(__be32 **p, struct nfsd4_change_info *c) +static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) { - write32(p, c->atomic); + *p++ = cpu_to_be32(c->atomic); if (c->change_supported) { - write64(p, c->before_change); - write64(p, c->after_change); + p = xdr_encode_hyper(p, c->before_change); + p = xdr_encode_hyper(p, c->after_change); } else { - write32(p, c->before_ctime_sec); - write32(p, c->before_ctime_nsec); - write32(p, c->after_ctime_sec); - write32(p, c->after_ctime_nsec); + *p++ = cpu_to_be32(c->before_ctime_sec); + *p++ = cpu_to_be32(c->before_ctime_nsec); + *p++ = cpu_to_be32(c->after_ctime_sec); + *p++ = cpu_to_be32(c->after_ctime_nsec); } + return p; } -#define RESERVE_SPACE(nbytes) do { \ - p = resp->p; \ - BUG_ON(p + XDR_QUADLEN(nbytes) > resp->end); \ -} while (0) -#define ADJUST_ARGS() resp->p = p - /* Encode as an array of strings the string given with components * separated @sep, escaped with esc_enter and esc_exit. */ -static __be32 nfsd4_encode_components_esc(char sep, char *components, - __be32 **pp, int *buflen, - char esc_enter, char esc_exit) +static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, + char *components, char esc_enter, + char esc_exit) { - __be32 *p = *pp; - __be32 *countp = p; + __be32 *p; + __be32 pathlen; + int pathlen_offset; int strlen, count=0; char *str, *end, *next; dprintk("nfsd4_encode_components(%s)\n", components); - if ((*buflen -= 4) < 0) + + pathlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) return nfserr_resource; - WRITE32(0); /* We will fill this in with @count later */ + p++; /* We will fill this in with @count later */ + end = str = components; while (*end) { bool found_esc = false; @@ -1789,59 +1761,57 @@ static __be32 nfsd4_encode_components_esc(char sep, char *components, strlen = end - str; if (strlen) { - if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) + p = xdr_reserve_space(xdr, strlen + 4); + if (!p) return nfserr_resource; - WRITE32(strlen); - WRITEMEM(str, strlen); + p = xdr_encode_opaque(p, str, strlen); count++; } else end++; str = end; } - *pp = p; - p = countp; - WRITE32(count); + pathlen = htonl(xdr->buf->len - pathlen_offset); + write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); return 0; } /* Encode as an array of strings the string given with components * separated @sep. */ -static __be32 nfsd4_encode_components(char sep, char *components, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, + char *components) { - return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0); + return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); } /* * encode a location element of a fs_locations structure */ -static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, + struct nfsd4_fs_location *location) { __be32 status; - __be32 *p = *pp; - status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen, + status = nfsd4_encode_components_esc(xdr, ':', location->hosts, '[', ']'); if (status) return status; - status = nfsd4_encode_components('/', location->path, &p, buflen); + status = nfsd4_encode_components(xdr, '/', location->path); if (status) return status; - *pp = p; return 0; } /* * Encode a path in RFC3530 'pathname4' format */ -static __be32 nfsd4_encode_path(const struct path *root, - const struct path *path, __be32 **pp, int *buflen) +static __be32 nfsd4_encode_path(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) { struct path cur = *path; - __be32 *p = *pp; + __be32 *p; struct dentry **components = NULL; unsigned int ncomponents = 0; __be32 err = nfserr_jukebox; @@ -1872,11 +1842,11 @@ static __be32 nfsd4_encode_path(const struct path *root, components[ncomponents++] = cur.dentry; cur.dentry = dget_parent(cur.dentry); } - - *buflen -= 4; - if (*buflen < 0) + err = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_free; - WRITE32(ncomponents); + *p++ = cpu_to_be32(ncomponents); while (ncomponents) { struct dentry *dentry = components[ncomponents - 1]; @@ -1884,20 +1854,18 @@ static __be32 nfsd4_encode_path(const struct path *root, spin_lock(&dentry->d_lock); len = dentry->d_name.len; - *buflen -= 4 + (XDR_QUADLEN(len) << 2); - if (*buflen < 0) { + p = xdr_reserve_space(xdr, len + 4); + if (!p) { spin_unlock(&dentry->d_lock); goto out_free; } - WRITE32(len); - WRITEMEM(dentry->d_name.name, len); + p = xdr_encode_opaque(p, dentry->d_name.name, len); dprintk("/%s", dentry->d_name.name); spin_unlock(&dentry->d_lock); dput(dentry); ncomponents--; } - *pp = p; err = 0; out_free: dprintk(")\n"); @@ -1908,8 +1876,8 @@ out_free: return err; } -static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, - const struct path *path, __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, + struct svc_rqst *rqstp, const struct path *path) { struct svc_export *exp_ps; __be32 res; @@ -1917,7 +1885,7 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, exp_ps = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp_ps)) return nfserrno(PTR_ERR(exp_ps)); - res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen); + res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); exp_put(exp_ps); return res; } @@ -1925,28 +1893,26 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp, /* * encode a fs_locations structure */ -static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp, - struct svc_export *exp, - __be32 **pp, int *buflen) +static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, + struct svc_rqst *rqstp, struct svc_export *exp) { __be32 status; int i; - __be32 *p = *pp; + __be32 *p; struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen); + status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); if (status) return status; - if ((*buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) return nfserr_resource; - WRITE32(fslocs->locations_count); + *p++ = cpu_to_be32(fslocs->locations_count); for (i=0; i<fslocs->locations_count; i++) { - status = nfsd4_encode_fs_location4(&fslocs->locations[i], - &p, buflen); + status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); if (status) return status; } - *pp = p; return 0; } @@ -1965,15 +1931,15 @@ static u32 nfs4_file_type(umode_t mode) } static inline __be32 -nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, - __be32 **p, int *buflen) +nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) { if (ace->whotype != NFS4_ACL_WHO_NAMED) - return nfs4_acl_write_who(ace->whotype, p, buflen); + return nfs4_acl_write_who(xdr, ace->whotype); else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen); + return nfsd4_encode_group(xdr, rqstp, ace->who_gid); else - return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen); + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -1982,31 +1948,28 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 -nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) { - __be32 *p = *pp; + __be32 *p; - if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4)) + p = xdr_reserve_space(xdr, len + 4 + 4 + 4); + if (!p) return nfserr_resource; /* * For now we use a 0 here to indicate the null translation; in * the future we may place a call to translation code here. */ - if ((*buflen -= 8) < 0) - return nfserr_resource; - - WRITE32(0); /* lfs */ - WRITE32(0); /* pi */ + *p++ = cpu_to_be32(0); /* lfs */ + *p++ = cpu_to_be32(0); /* pi */ p = xdr_encode_opaque(p, context, len); - *buflen -= (XDR_QUADLEN(len) << 2) + 4; - - *pp = p; return 0; } #else static inline __be32 -nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen) +nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, + void *context, int len) { return 0; } #endif @@ -2045,12 +2008,11 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat) /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. - * - * countp is the buffer size in _words_ */ -__be32 -nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 **buffer, int count, u32 *bmval, +static __be32 +nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, + struct svc_export *exp, + struct dentry *dentry, u32 *bmval, struct svc_rqst *rqstp, int ignore_crossmnt) { u32 bmval0 = bmval[0]; @@ -2059,12 +2021,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, struct kstat stat; struct svc_fh *tempfh = NULL; struct kstatfs statfs; - int buflen = count << 2; - __be32 *attrlenp; + __be32 *p; + int starting_len = xdr->buf->len; + int attrlen_offset; + __be32 attrlen; u32 dummy; u64 dummy64; u32 rdattr_err = 0; - __be32 *p = *buffer; __be32 status; int err; int aclsupport = 0; @@ -2095,8 +2058,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, err = vfs_getattr(&path, &stat); if (err) goto out_nfserr; - if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | - FATTR4_WORD0_MAXNAME)) || + if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { err = vfs_statfs(&path, &statfs); @@ -2145,25 +2108,33 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ if (bmval2) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(bmval0); - WRITE32(bmval1); - WRITE32(bmval2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); + *p++ = cpu_to_be32(bmval2); } else if (bmval1) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bmval0); + *p++ = cpu_to_be32(bmval1); } else { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE32(1); - WRITE32(bmval0); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(bmval0); } - attrlenp = p++; /* to be backfilled later */ + + attrlen_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { u32 word0 = nfsd_suppattrs0(minorversion); @@ -2175,296 +2146,343 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (!contextsupport) word2 &= ~FATTR4_WORD2_SECURITY_LABEL; if (!word2) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(2); - WRITE32(word0); - WRITE32(word1); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); } else { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(word0); - WRITE32(word1); - WRITE32(word2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(word0); + *p++ = cpu_to_be32(word1); + *p++ = cpu_to_be32(word2); } } if (bmval0 & FATTR4_WORD0_TYPE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; dummy = nfs4_file_type(stat.mode); if (dummy == NF4BAD) { status = nfserr_serverfault; goto out; } - WRITE32(dummy); + *p++ = cpu_to_be32(dummy); } if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) - WRITE32(NFS4_FH_PERSISTENT); + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); else - WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); + *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| + NFS4_FH_VOL_RENAME); } if (bmval0 & FATTR4_WORD0_CHANGE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - write_change(&p, &stat, dentry->d_inode); + p = encode_change(p, &stat, dentry->d_inode); } if (bmval0 & FATTR4_WORD0_SIZE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(stat.size); + p = xdr_encode_hyper(p, stat.size); } if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_FSID) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; if (exp->ex_fslocs.migrated) { - WRITE64(NFS4_REFERRAL_FSID_MAJOR); - WRITE64(NFS4_REFERRAL_FSID_MINOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); } else switch(fsid_source(fhp)) { case FSIDSOURCE_FSID: - WRITE64((u64)exp->ex_fsid); - WRITE64((u64)0); + p = xdr_encode_hyper(p, (u64)exp->ex_fsid); + p = xdr_encode_hyper(p, (u64)0); break; case FSIDSOURCE_DEV: - WRITE32(0); - WRITE32(MAJOR(stat.dev)); - WRITE32(0); - WRITE32(MINOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MAJOR(stat.dev)); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(MINOR(stat.dev)); break; case FSIDSOURCE_UUID: - WRITEMEM(exp->ex_uuid, 16); + p = xdr_encode_opaque_fixed(p, exp->ex_uuid, + EX_UUID_LEN); break; } } if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_LEASE_TIME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(nn->nfsd4_lease); + *p++ = cpu_to_be32(nn->nfsd4_lease); } if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(rdattr_err); + *p++ = cpu_to_be32(rdattr_err); } if (bmval0 & FATTR4_WORD0_ACL) { struct nfs4_ace *ace; if (acl == NULL) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); goto out_acl; } - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(acl->naces); + *p++ = cpu_to_be32(acl->naces); for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { - if ((buflen -= 4*3) < 0) + p = xdr_reserve_space(xdr, 4*3); + if (!p) goto out_resource; - WRITE32(ace->type); - WRITE32(ace->flag); - WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); + *p++ = cpu_to_be32(ace->type); + *p++ = cpu_to_be32(ace->flag); + *p++ = cpu_to_be32(ace->access_mask & + NFS4_ACE_MASK_ALL); + status = nfsd4_encode_aclname(xdr, rqstp, ace); if (status) goto out; } } out_acl: if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(aclsupport ? + *p++ = cpu_to_be32(aclsupport ? ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); } if (bmval0 & FATTR4_WORD0_CANSETTIME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(0); + *p++ = cpu_to_be32(0); } if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_FILEHANDLE) { - buflen -= (XDR_QUADLEN(fhp->fh_handle.fh_size) << 2) + 4; - if (buflen < 0) + p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); + if (!p) goto out_resource; - WRITE32(fhp->fh_handle.fh_size); - WRITEMEM(&fhp->fh_handle.fh_base, fhp->fh_handle.fh_size); + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, + fhp->fh_handle.fh_size); } if (bmval0 & FATTR4_WORD0_FILEID) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(stat.ino); + p = xdr_encode_hyper(p, stat.ino); } if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_ffree); + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); } if (bmval0 & FATTR4_WORD0_FILES_FREE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_ffree); + p = xdr_encode_hyper(p, (u64) statfs.f_ffree); } if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) statfs.f_files); + p = xdr_encode_hyper(p, (u64) statfs.f_files); } if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); + status = nfsd4_encode_fs_locations(xdr, rqstp, exp); if (status) goto out; } if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64(exp->ex_path.mnt->mnt_sb->s_maxbytes); + p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); } if (bmval0 & FATTR4_WORD0_MAXLINK) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(255); + *p++ = cpu_to_be32(255); } if (bmval0 & FATTR4_WORD0_MAXNAME) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(statfs.f_namelen); + *p++ = cpu_to_be32(statfs.f_namelen); } if (bmval0 & FATTR4_WORD0_MAXREAD) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) svc_max_payload(rqstp)); + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); } if (bmval0 & FATTR4_WORD0_MAXWRITE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE64((u64) svc_max_payload(rqstp)); + p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); } if (bmval1 & FATTR4_WORD1_MODE) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(stat.mode & S_IALLUGO); + *p++ = cpu_to_be32(stat.mode & S_IALLUGO); } if (bmval1 & FATTR4_WORD1_NO_TRUNC) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(1); + *p++ = cpu_to_be32(1); } if (bmval1 & FATTR4_WORD1_NUMLINKS) { - if ((buflen -= 4) < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto out_resource; - WRITE32(stat.nlink); + *p++ = cpu_to_be32(stat.nlink); } if (bmval1 & FATTR4_WORD1_OWNER) { - status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); + status = nfsd4_encode_user(xdr, rqstp, stat.uid); if (status) goto out; } if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { - status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); + status = nfsd4_encode_group(xdr, rqstp, stat.gid); if (status) goto out; } if (bmval1 & FATTR4_WORD1_RAWDEV) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; - WRITE32((u32) MAJOR(stat.rdev)); - WRITE32((u32) MINOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); + *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); } if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_FREE) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_SPACE_USED) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; dummy64 = (u64)stat.blocks << 9; - WRITE64(dummy64); + p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.atime.tv_sec); - WRITE32(stat.atime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); + *p++ = cpu_to_be32(stat.atime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE32(0); - WRITE32(1); - WRITE32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(0); } if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.ctime.tv_sec); - WRITE32(stat.ctime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); + *p++ = cpu_to_be32(stat.ctime.tv_nsec); } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - if ((buflen -= 12) < 0) + p = xdr_reserve_space(xdr, 12); + if (!p) goto out_resource; - WRITE64((s64)stat.mtime.tv_sec); - WRITE32(stat.mtime.tv_nsec); + p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); + *p++ = cpu_to_be32(stat.mtime.tv_nsec); } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - if ((buflen -= 8) < 0) + p = xdr_reserve_space(xdr, 8); + if (!p) goto out_resource; /* * Get parent's attributes if not ignoring crossmount @@ -2473,25 +2491,26 @@ out_acl: if (ignore_crossmnt == 0 && dentry == exp->ex_path.mnt->mnt_root) get_parent_attributes(exp, &stat); - WRITE64(stat.ino); + p = xdr_encode_hyper(p, stat.ino); } if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { - status = nfsd4_encode_security_label(rqstp, context, - contextlen, &p, &buflen); + status = nfsd4_encode_security_label(xdr, rqstp, context, + contextlen); if (status) goto out; } if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - if ((buflen -= 16) < 0) + p = xdr_reserve_space(xdr, 16); + if (!p) goto out_resource; - WRITE32(3); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); - WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2); } - *attrlenp = htonl((char *)p - (char *)attrlenp - 4); - *buffer = p; + attrlen = htonl(xdr->buf->len - attrlen_offset - 4); + write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); status = nfs_ok; out: @@ -2504,6 +2523,8 @@ out: fh_put(tempfh); kfree(tempfh); } + if (status) + xdr_truncate_encode(xdr, starting_len); return status; out_nfserr: status = nfserrno(err); @@ -2513,6 +2534,37 @@ out_resource: goto out; } +static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr, + struct xdr_buf *buf, __be32 *p, int bytes) +{ + xdr->scratch.iov_len = 0; + memset(buf, 0, sizeof(struct xdr_buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = 0; + buf->len = 0; + xdr->buf = buf; + xdr->iov = buf->head; + xdr->p = p; + xdr->end = (void *)p + bytes; + buf->buflen = bytes; +} + +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, u32 *bmval, + struct svc_rqst *rqstp, int ignore_crossmnt) +{ + struct xdr_buf dummy; + struct xdr_stream xdr; + __be32 ret; + + svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); + ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, + ignore_crossmnt); + *p = xdr.p; + return ret; +} + static inline int attributes_need_mount(u32 *bmval) { if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) @@ -2523,8 +2575,8 @@ static inline int attributes_need_mount(u32 *bmval) } static __be32 -nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, - const char *name, int namlen, __be32 **p, int buflen) +nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, + const char *name, int namlen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; @@ -2576,7 +2628,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, } out_encode: - nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, + nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, cd->rd_rqstp, ignore_crossmnt); out_put: dput(dentry); @@ -2585,9 +2637,12 @@ out_put: } static __be32 * -nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) +nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { - if (buflen < 6) + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) return NULL; *p++ = htonl(2); *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ @@ -2604,10 +2659,13 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, { struct readdir_cd *ccd = ccdv; struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); - int buflen; - __be32 *p = cd->buffer; - __be32 *cookiep; + struct xdr_stream *xdr = cd->xdr; + int start_offset = xdr->buf->len; + int cookie_offset; + int entry_bytes; __be32 nfserr = nfserr_toosmall; + __be64 wire_offset; + __be32 *p; /* In nfsv4, "." and ".." never make it onto the wire.. */ if (name && isdotent(name, namlen)) { @@ -2615,19 +2673,24 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, return 0; } - if (cd->offset) - xdr_encode_hyper(cd->offset, (u64) offset); + if (cd->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, + &wire_offset, 8); + } - buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); - if (buflen < 0) + p = xdr_reserve_space(xdr, 4); + if (!p) goto fail; - *p++ = xdr_one; /* mark entry present */ - cookiep = p; + cookie_offset = xdr->buf->len; + p = xdr_reserve_space(xdr, 3*4 + namlen); + if (!p) + goto fail; p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */ p = xdr_encode_array(p, name, namlen); /* name length & name */ - nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen); + nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); switch (nfserr) { case nfs_ok: break; @@ -2635,6 +2698,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, nfserr = nfserr_toosmall; goto fail; case nfserr_noent: + xdr_truncate_encode(xdr, start_offset); goto skip_entry; default: /* @@ -2646,59 +2710,74 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, */ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) goto fail; - p = nfsd4_encode_rdattr_error(p, buflen, nfserr); + p = nfsd4_encode_rdattr_error(xdr, nfserr); if (p == NULL) { nfserr = nfserr_toosmall; goto fail; } } - cd->buflen -= (p - cd->buffer); - cd->buffer = p; - cd->offset = cookiep; + nfserr = nfserr_toosmall; + entry_bytes = xdr->buf->len - start_offset; + if (entry_bytes > cd->rd_maxcount) + goto fail; + cd->rd_maxcount -= entry_bytes; + if (!cd->rd_dircount) + goto fail; + cd->rd_dircount--; + cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; return 0; fail: + xdr_truncate_encode(xdr, start_offset); cd->common.err = nfserr; return -EINVAL; } -static void -nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +static __be32 +nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) { __be32 *p; - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(sid->si_generation); - WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, sizeof(stateid_t)); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sid->si_generation); + p = xdr_encode_opaque_fixed(p, &sid->si_opaque, + sizeof(stateid_opaque_t)); + return 0; } static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(8); - WRITE32(access->ac_supported); - WRITE32(access->ac_resp_access); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(access->ac_supported); + *p++ = cpu_to_be32(access->ac_resp_access); } return nfserr; } static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8); - WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(bcts->dir); + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(bcts->dir); /* Sorry, we do not yet support RDMA over 4.1: */ - WRITE32(0); - ADJUST_ARGS(); + *p++ = cpu_to_be32(0); } return nfserr; } @@ -2706,8 +2785,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &close->cl_stateid); + nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid); return nfserr; } @@ -2716,12 +2797,15 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(NFS4_VERIFIER_SIZE); - WRITEMEM(commit->co_verf.data, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, commit->co_verf.data, + NFS4_VERIFIER_SIZE); } return nfserr; } @@ -2729,15 +2813,17 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(32); - write_cinfo(&p, &create->cr_cinfo); - WRITE32(2); - WRITE32(create->cr_bmval[0]); - WRITE32(create->cr_bmval[1]); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &create->cr_cinfo); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(create->cr_bmval[0]); + *p++ = cpu_to_be32(create->cr_bmval[1]); } return nfserr; } @@ -2746,14 +2832,13 @@ static __be32 nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) { struct svc_fh *fhp = getattr->ga_fhp; - int buflen; + struct xdr_stream *xdr = &resp->xdr; if (nfserr) return nfserr; - buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); - nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, - &resp->p, buflen, getattr->ga_bmval, + nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, + getattr->ga_bmval, resp->rqstp, 0); return nfserr; } @@ -2761,16 +2846,17 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 static __be32 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { + struct xdr_stream *xdr = &resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; __be32 *p; if (!nfserr) { len = fhp->fh_handle.fh_size; - RESERVE_SPACE(len + 4); - WRITE32(len); - WRITEMEM(&fhp->fh_handle.fh_base, len); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); } return nfserr; } @@ -2779,35 +2865,50 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh * Including all fields other than the name, a LOCK4denied structure requires * 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. */ -static void -nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld) +static __be32 +nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) { struct xdr_netobj *conf = &ld->ld_owner; __be32 *p; - RESERVE_SPACE(32 + XDR_LEN(conf->len)); - WRITE64(ld->ld_start); - WRITE64(ld->ld_length); - WRITE32(ld->ld_type); +again: + p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); + if (!p) { + /* + * Don't fail to return the result just because we can't + * return the conflicting open: + */ + if (conf->len) { + kfree(conf->data); + conf->len = 0; + conf->data = NULL; + goto again; + } + return nfserr_resource; + } + p = xdr_encode_hyper(p, ld->ld_start); + p = xdr_encode_hyper(p, ld->ld_length); + *p++ = cpu_to_be32(ld->ld_type); if (conf->len) { - WRITEMEM(&ld->ld_clientid, 8); - WRITE32(conf->len); - WRITEMEM(conf->data, conf->len); + p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); + p = xdr_encode_opaque(p, conf->data, conf->len); kfree(conf->data); } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ - WRITE64((u64)0); /* clientid */ - WRITE32(0); /* length of owner name */ + p = xdr_encode_hyper(p, (u64)0); /* clientid */ + *p++ = cpu_to_be32(0); /* length of owner name */ } - ADJUST_ARGS(); + return nfserr_denied; } static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); else if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(resp, &lock->lk_denied); + nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); return nfserr; } @@ -2815,16 +2916,20 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { + struct xdr_stream *xdr = &resp->xdr; + if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(resp, &lockt->lt_denied); + nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); return nfserr; } static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &locku->lu_stateid); + nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid); return nfserr; } @@ -2833,12 +2938,14 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(20); - write_cinfo(&p, &link->li_cinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &link->li_cinfo); } return nfserr; } @@ -2847,72 +2954,86 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) goto out; - nfsd4_encode_stateid(resp, &open->op_stateid); - RESERVE_SPACE(40); - write_cinfo(&p, &open->op_cinfo); - WRITE32(open->op_rflags); - WRITE32(2); - WRITE32(open->op_bmval[0]); - WRITE32(open->op_bmval[1]); - WRITE32(open->op_delegate_type); - ADJUST_ARGS(); + nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); + if (nfserr) + goto out; + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &open->op_cinfo); + *p++ = cpu_to_be32(open->op_rflags); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(open->op_bmval[0]); + *p++ = cpu_to_be32(open->op_bmval[1]); + *p++ = cpu_to_be32(open->op_delegate_type); switch (open->op_delegate_type) { case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - nfsd4_encode_stateid(resp, &open->op_delegate_stateid); - RESERVE_SPACE(20); - WRITE32(open->op_recall); + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_recall); /* * TODO: ACE's in delegations */ - WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - WRITE32(0); - WRITE32(0); - WRITE32(0); /* XXX: is NULL principal ok? */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ break; case NFS4_OPEN_DELEGATE_WRITE: - nfsd4_encode_stateid(resp, &open->op_delegate_stateid); - RESERVE_SPACE(32); - WRITE32(0); + nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 32); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* * TODO: space_limit's in delegations */ - WRITE32(NFS4_LIMIT_SIZE); - WRITE32(~(u32)0); - WRITE32(~(u32)0); + *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); + *p++ = cpu_to_be32(~(u32)0); + *p++ = cpu_to_be32(~(u32)0); /* * TODO: ACE's in delegations */ - WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - WRITE32(0); - WRITE32(0); - WRITE32(0); /* XXX: is NULL principal ok? */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ break; case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ switch (open->op_why_no_deleg) { case WND4_CONTENTION: case WND4_RESOURCE: - RESERVE_SPACE(8); - WRITE32(open->op_why_no_deleg); - WRITE32(0); /* deleg signaling not supported yet */ + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); + /* deleg signaling not supported yet: */ + *p++ = cpu_to_be32(0); break; default: - RESERVE_SPACE(4); - WRITE32(open->op_why_no_deleg); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(open->op_why_no_deleg); } - ADJUST_ARGS(); break; default: BUG(); @@ -2925,8 +3046,10 @@ out: static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); + nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); return nfserr; } @@ -2934,127 +3057,233 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { + struct xdr_stream *xdr = &resp->xdr; + if (!nfserr) - nfsd4_encode_stateid(resp, &od->od_stateid); + nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid); return nfserr; } -static __be32 -nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, - struct nfsd4_read *read) +static __be32 nfsd4_encode_splice_read( + struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) { + struct xdr_stream *xdr = &resp->xdr; + struct xdr_buf *buf = xdr->buf; u32 eof; - int v; - struct page *page; - unsigned long maxcount; - long len; - __be32 *p; + int space_left; + __be32 nfserr; + __be32 *p = xdr->p - 2; - if (nfserr) - return nfserr; - if (resp->xbuf->page_len) + /* + * Don't inline pages unless we know there's room for eof, + * count, and possible padding: + */ + if (xdr->end - xdr->p < 3) return nfserr_resource; - RESERVE_SPACE(8); /* eof flag and byte count */ + nfserr = nfsd_splice_read(read->rd_rqstp, file, + read->rd_offset, &maxcount); + if (nfserr) { + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode: + */ + buf->page_len = 0; + return nfserr; + } - maxcount = svc_max_payload(resp->rqstp); - if (maxcount > read->rd_length) - maxcount = read->rd_length; + eof = (read->rd_offset + maxcount >= + read->rd_fhp->fh_dentry->d_inode->i_size); + + *(p++) = htonl(eof); + *(p++) = htonl(maxcount); + + buf->page_len = maxcount; + buf->len += maxcount; + xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE; + + /* Use rest of head for padding and remaining ops: */ + buf->tail[0].iov_base = xdr->p; + buf->tail[0].iov_len = 0; + xdr->iov = buf->tail; + if (maxcount&3) { + int pad = 4 - (maxcount&3); + + *(xdr->p++) = 0; + + buf->tail[0].iov_base += maxcount&3; + buf->tail[0].iov_len = pad; + buf->len += pad; + } + + space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, + buf->buflen - buf->len); + buf->buflen = buf->len + space_left; + xdr->end = (__be32 *)((void *)xdr->end + space_left); + + return 0; +} + +static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, + struct nfsd4_read *read, + struct file *file, unsigned long maxcount) +{ + struct xdr_stream *xdr = &resp->xdr; + u32 eof; + int v; + int starting_len = xdr->buf->len - 8; + long len; + int thislen; + __be32 nfserr; + __be32 tmp; + __be32 *p; + u32 zzz = 0; + int pad; len = maxcount; v = 0; - while (len > 0) { - page = *(resp->rqstp->rq_next_page); - if (!page) { /* ran out of pages */ - maxcount -= len; - break; - } - resp->rqstp->rq_vec[v].iov_base = page_address(page); - resp->rqstp->rq_vec[v].iov_len = - len < PAGE_SIZE ? len : PAGE_SIZE; - resp->rqstp->rq_next_page++; + + thislen = (void *)xdr->end - (void *)xdr->p; + if (len < thislen) + thislen = len; + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; + v++; + len -= thislen; + + while (len) { + thislen = min_t(long, len, PAGE_SIZE); + p = xdr_reserve_space(xdr, (thislen+3)&~3); + WARN_ON_ONCE(!p); + resp->rqstp->rq_vec[v].iov_base = p; + resp->rqstp->rq_vec[v].iov_len = thislen; v++; - len -= PAGE_SIZE; + len -= thislen; } read->rd_vlen = v; - nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, - read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, - &maxcount); - + nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec, + read->rd_vlen, &maxcount); if (nfserr) return nfserr; + xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); + eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size); - WRITE32(eof); - WRITE32(maxcount); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = (char*)p - - (char*)resp->xbuf->head[0].iov_base; - resp->xbuf->page_len = maxcount; + tmp = htonl(eof); + write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); + tmp = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = p; - resp->xbuf->tail[0].iov_len = 0; - if (maxcount&3) { - RESERVE_SPACE(4); - WRITE32(0); - resp->xbuf->tail[0].iov_base += maxcount&3; - resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); - ADJUST_ARGS(); - } + pad = (maxcount&3) ? 4 - (maxcount&3) : 0; + write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, + &zzz, pad); return 0; + } static __be32 -nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_read *read) { - int maxcount; - char *page; + unsigned long maxcount; + struct xdr_stream *xdr = &resp->xdr; + struct file *file = read->rd_filp; + int starting_len = xdr->buf->len; + struct raparms *ra; __be32 *p; + __be32 err; if (nfserr) return nfserr; - if (resp->xbuf->page_len) + + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ + if (!p) { + WARN_ON_ONCE(resp->rqstp->rq_splice_ok); return nfserr_resource; - if (!*resp->rqstp->rq_next_page) + } + if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) { + WARN_ON_ONCE(1); return nfserr_resource; + } + xdr_commit_encode(xdr); + + maxcount = svc_max_payload(resp->rqstp); + if (maxcount > xdr->buf->buflen - xdr->buf->len) + maxcount = xdr->buf->buflen - xdr->buf->len; + if (maxcount > read->rd_length) + maxcount = read->rd_length; + + if (!read->rd_filp) { + err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, + &file, &ra); + if (err) + goto err_truncate; + } + + if (file->f_op->splice_read && resp->rqstp->rq_splice_ok) + err = nfsd4_encode_splice_read(resp, read, file, maxcount); + else + err = nfsd4_encode_readv(resp, read, file, maxcount); + + if (!read->rd_filp) + nfsd_put_tmp_read_open(file, ra); + +err_truncate: + if (err) + xdr_truncate_encode(xdr, starting_len); + return err; +} + +static __be32 +nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink) +{ + int maxcount; + __be32 wire_count; + int zero = 0; + struct xdr_stream *xdr = &resp->xdr; + int length_offset = xdr->buf->len; + __be32 *p; - page = page_address(*(resp->rqstp->rq_next_page++)); + if (nfserr) + return nfserr; + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; maxcount = PAGE_SIZE; - RESERVE_SPACE(4); + p = xdr_reserve_space(xdr, maxcount); + if (!p) + return nfserr_resource; /* * XXX: By default, the ->readlink() VFS op will truncate symlinks * if they would overflow the buffer. Is this kosher in NFSv4? If * not, one easy fix is: if ->readlink() precisely fills the buffer, * assume that truncation occurred, and return NFS4ERR_RESOURCE. */ - nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount); + nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, + (char *)p, &maxcount); if (nfserr == nfserr_isdir) - return nfserr_inval; - if (nfserr) + nfserr = nfserr_inval; + if (nfserr) { + xdr_truncate_encode(xdr, length_offset); return nfserr; - - WRITE32(maxcount); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = (char*)p - - (char*)resp->xbuf->head[0].iov_base; - resp->xbuf->page_len = maxcount; - - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = p; - resp->xbuf->tail[0].iov_len = 0; - if (maxcount&3) { - RESERVE_SPACE(4); - WRITE32(0); - resp->xbuf->tail[0].iov_base += maxcount&3; - resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); - ADJUST_ARGS(); } + + wire_count = htonl(maxcount); + write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4); + xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4)); + if (maxcount & 3) + write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, + &zero, 4 - (maxcount&3)); return 0; } @@ -3062,47 +3291,52 @@ static __be32 nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir) { int maxcount; + int bytes_left; loff_t offset; - __be32 *page, *savep, *tailbase; + __be64 wire_offset; + struct xdr_stream *xdr = &resp->xdr; + int starting_len = xdr->buf->len; __be32 *p; if (nfserr) return nfserr; - if (resp->xbuf->page_len) - return nfserr_resource; - if (!*resp->rqstp->rq_next_page) - return nfserr_resource; - RESERVE_SPACE(NFS4_VERIFIER_SIZE); - savep = p; + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ - WRITE32(0); - WRITE32(0); - ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - tailbase = p; - - maxcount = PAGE_SIZE; - if (maxcount > readdir->rd_maxcount) - maxcount = readdir->rd_maxcount; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) + - (char *)resp->xdr.buf->head[0].iov_base; /* - * Convert from bytes to words, account for the two words already - * written, make sure to leave two words at the end for the next - * pointer and eof field. + * Number of bytes left for directory entries allowing for the + * final 8 bytes of the readdir and a following failed op: */ - maxcount = (maxcount >> 2) - 4; - if (maxcount < 0) { - nfserr = nfserr_toosmall; + bytes_left = xdr->buf->buflen - xdr->buf->len + - COMPOUND_ERR_SLACK_SPACE - 8; + if (bytes_left < 0) { + nfserr = nfserr_resource; goto err_no_verf; } + maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX); + /* + * Note the rfc defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier above + * and the 8 bytes encoded at the end of this function: + */ + if (maxcount < 16) { + nfserr = nfserr_toosmall; + goto err_no_verf; + } + maxcount = min_t(int, maxcount-16, bytes_left); - page = page_address(*(resp->rqstp->rq_next_page++)); + readdir->xdr = xdr; + readdir->rd_maxcount = maxcount; readdir->common.err = 0; - readdir->buflen = maxcount; - readdir->buffer = page; - readdir->offset = NULL; + readdir->cookie_offset = 0; offset = readdir->rd_cookie; nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, @@ -3110,42 +3344,49 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 &readdir->common, nfsd4_encode_dirent); if (nfserr == nfs_ok && readdir->common.err == nfserr_toosmall && - readdir->buffer == page) - nfserr = nfserr_toosmall; + xdr->buf->len == starting_len + 8) { + /* nothing encoded; which limit did we hit?: */ + if (maxcount - 16 < bytes_left) + /* It was the fault of rd_maxcount: */ + nfserr = nfserr_toosmall; + else + /* We ran out of buffer space: */ + nfserr = nfserr_resource; + } if (nfserr) goto err_no_verf; - if (readdir->offset) - xdr_encode_hyper(readdir->offset, offset); + if (readdir->cookie_offset) { + wire_offset = cpu_to_be64(offset); + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, + &wire_offset, 8); + } - p = readdir->buffer; + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + goto err_no_verf; + } *p++ = 0; /* no more entries */ *p++ = htonl(readdir->common.err == nfserr_eof); - resp->xbuf->page_len = ((char*)p) - - (char*)page_address(*(resp->rqstp->rq_next_page-1)); - - /* Use rest of head for padding and remaining ops: */ - resp->xbuf->tail[0].iov_base = tailbase; - resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: - p = savep; - ADJUST_ARGS(); + xdr_truncate_encode(xdr, starting_len); return nfserr; } static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(20); - write_cinfo(&p, &remove->rm_cinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &remove->rm_cinfo); } return nfserr; } @@ -3153,19 +3394,21 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(40); - write_cinfo(&p, &rename->rn_sinfo); - write_cinfo(&p, &rename->rn_tinfo); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &rename->rn_sinfo); + p = encode_cinfo(p, &rename->rn_tinfo); } return nfserr; } static __be32 -nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, +nfsd4_do_encode_secinfo(struct xdr_stream *xdr, __be32 nfserr, struct svc_export *exp) { u32 i, nflavs, supported; @@ -3176,6 +3419,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (nfserr) goto out; + nfserr = nfserr_resource; if (exp->ex_nflavors) { flavs = exp->ex_flavors; nflavs = exp->ex_nflavors; @@ -3197,9 +3441,10 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, } supported = 0; - RESERVE_SPACE(4); + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; flavorsp = p++; /* to be backfilled later */ - ADJUST_ARGS(); for (i = 0; i < nflavs; i++) { rpc_authflavor_t pf = flavs[i].pseudoflavor; @@ -3207,18 +3452,20 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4); - WRITE32(RPC_AUTH_GSS); - WRITE32(info.oid.len); - WRITEMEM(info.oid.data, info.oid.len); - WRITE32(info.qop); - WRITE32(info.service); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4 + 4 + + XDR_LEN(info.oid.len) + 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(RPC_AUTH_GSS); + p = xdr_encode_opaque(p, info.oid.data, info.oid.len); + *p++ = cpu_to_be32(info.qop); + *p++ = cpu_to_be32(info.service); } else if (pf < RPC_AUTH_MAXFLAVOR) { supported++; - RESERVE_SPACE(4); - WRITE32(pf); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = cpu_to_be32(pf); } else { if (report) pr_warn("NFS: SECINFO: security flavor %u " @@ -3229,7 +3476,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, if (nflavs != supported) report = false; *flavorsp = htonl(supported); - + nfserr = 0; out: if (exp) exp_put(exp); @@ -3240,14 +3487,18 @@ static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { - return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp); + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp); } static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo_no_name *secinfo) { - return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp); + struct xdr_stream *xdr = &resp->xdr; + + return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp); } /* @@ -3257,41 +3508,47 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; - RESERVE_SPACE(16); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; if (nfserr) { - WRITE32(3); - WRITE32(0); - WRITE32(0); - WRITE32(0); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); } else { - WRITE32(3); - WRITE32(setattr->sa_bmval[0]); - WRITE32(setattr->sa_bmval[1]); - WRITE32(setattr->sa_bmval[2]); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(setattr->sa_bmval[0]); + *p++ = cpu_to_be32(setattr->sa_bmval[1]); + *p++ = cpu_to_be32(setattr->sa_bmval[2]); } - ADJUST_ARGS(); return nfserr; } static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(8 + NFS4_VERIFIER_SIZE); - WRITEMEM(&scd->se_clientid, 8); - WRITEMEM(&scd->se_confirm, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); + p = xdr_encode_opaque_fixed(p, &scd->se_confirm, + NFS4_VERIFIER_SIZE); } else if (nfserr == nfserr_clid_inuse) { - RESERVE_SPACE(8); - WRITE32(0); - WRITE32(0); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(0); } return nfserr; } @@ -3299,14 +3556,17 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n static __be32 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (!nfserr) { - RESERVE_SPACE(16); - WRITE32(write->wr_bytes_written); - WRITE32(write->wr_how_written); - WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_how_written); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); } return nfserr; } @@ -3323,6 +3583,7 @@ static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; char *major_id; char *server_scope; @@ -3338,60 +3599,61 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, server_scope = utsname()->nodename; server_scope_sz = strlen(server_scope); - RESERVE_SPACE( + p = xdr_reserve_space(xdr, 8 /* eir_clientid */ + 4 /* eir_sequenceid */ + 4 /* eir_flags */ + 4 /* spr_how */); + if (!p) + return nfserr_resource; - WRITEMEM(&exid->clientid, 8); - WRITE32(exid->seqid); - WRITE32(exid->flags); + p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); + *p++ = cpu_to_be32(exid->seqid); + *p++ = cpu_to_be32(exid->flags); - WRITE32(exid->spa_how); - ADJUST_ARGS(); + *p++ = cpu_to_be32(exid->spa_how); switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: /* spo_must_enforce, spo_must_allow */ - RESERVE_SPACE(16); + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; /* spo_must_enforce bitmap: */ - WRITE32(2); - WRITE32(nfs4_minimal_spo_must_enforce[0]); - WRITE32(nfs4_minimal_spo_must_enforce[1]); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]); + *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]); /* empty spo_must_allow bitmap: */ - WRITE32(0); + *p++ = cpu_to_be32(0); - ADJUST_ARGS(); break; default: WARN_ON_ONCE(1); } - RESERVE_SPACE( + p = xdr_reserve_space(xdr, 8 /* so_minor_id */ + 4 /* so_major_id.len */ + (XDR_QUADLEN(major_id_sz) * 4) + 4 /* eir_server_scope.len */ + (XDR_QUADLEN(server_scope_sz) * 4) + 4 /* eir_server_impl_id.count (0) */); + if (!p) + return nfserr_resource; /* The server_owner struct */ - WRITE64(minor_id); /* Minor id */ + p = xdr_encode_hyper(p, minor_id); /* Minor id */ /* major id */ - WRITE32(major_id_sz); - WRITEMEM(major_id, major_id_sz); + p = xdr_encode_opaque(p, major_id, major_id_sz); /* Server scope */ - WRITE32(server_scope_sz); - WRITEMEM(server_scope, server_scope_sz); + p = xdr_encode_opaque(p, server_scope, server_scope_sz); /* Implementation id */ - WRITE32(0); /* zero length nfs_impl_id4 array */ - ADJUST_ARGS(); + *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ return 0; } @@ -3399,47 +3661,54 @@ static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create_session *sess) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) return nfserr; - RESERVE_SPACE(24); - WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(sess->seqid); - WRITE32(sess->flags); - ADJUST_ARGS(); - - RESERVE_SPACE(28); - WRITE32(0); /* headerpadsz */ - WRITE32(sess->fore_channel.maxreq_sz); - WRITE32(sess->fore_channel.maxresp_sz); - WRITE32(sess->fore_channel.maxresp_cached); - WRITE32(sess->fore_channel.maxops); - WRITE32(sess->fore_channel.maxreqs); - WRITE32(sess->fore_channel.nr_rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 24); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, sess->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(sess->seqid); + *p++ = cpu_to_be32(sess->flags); + + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->fore_channel.maxops); + *p++ = cpu_to_be32(sess->fore_channel.maxreqs); + *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); if (sess->fore_channel.nr_rdma_attrs) { - RESERVE_SPACE(4); - WRITE32(sess->fore_channel.rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); } - RESERVE_SPACE(28); - WRITE32(0); /* headerpadsz */ - WRITE32(sess->back_channel.maxreq_sz); - WRITE32(sess->back_channel.maxresp_sz); - WRITE32(sess->back_channel.maxresp_cached); - WRITE32(sess->back_channel.maxops); - WRITE32(sess->back_channel.maxreqs); - WRITE32(sess->back_channel.nr_rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 28); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(0); /* headerpadsz */ + *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); + *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); + *p++ = cpu_to_be32(sess->back_channel.maxops); + *p++ = cpu_to_be32(sess->back_channel.maxreqs); + *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); if (sess->back_channel.nr_rdma_attrs) { - RESERVE_SPACE(4); - WRITE32(sess->back_channel.rdma_attrs); - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); } return 0; } @@ -3448,22 +3717,25 @@ static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { + struct xdr_stream *xdr = &resp->xdr; __be32 *p; if (nfserr) return nfserr; - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); - WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(seq->seqid); - WRITE32(seq->slotid); + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, seq->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(seq->seqid); + *p++ = cpu_to_be32(seq->slotid); /* Note slotid's are numbered from zero: */ - WRITE32(seq->maxslots - 1); /* sr_highest_slotid */ - WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */ - WRITE32(seq->status_flags); + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ + *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ + *p++ = cpu_to_be32(seq->status_flags); - ADJUST_ARGS(); - resp->cstate.datap = p; /* DRC cache data pointer */ + resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ return 0; } @@ -3471,20 +3743,22 @@ static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid *test_stateid) { + struct xdr_stream *xdr = &resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; if (nfserr) return nfserr; - RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); + p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); + if (!p) + return nfserr_resource; *p++ = htonl(test_stateid->ts_num_ids); list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { *p++ = stateid->ts_id_status; } - ADJUST_ARGS(); return nfserr; } @@ -3563,70 +3837,67 @@ static nfsd4_enc nfsd4_enc_ops[] = { }; /* - * Calculate the total amount of memory that the compound response has taken - * after encoding the current operation with pad. - * - * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop() - * which was specified at nfsd4_operation, else pad is zero. - * - * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached. + * Calculate whether we still have space to encode repsize bytes. + * There are two considerations: + * - For NFS versions >=4.1, the size of the reply must stay within + * session limits + * - For all NFS versions, we must stay within limited preallocated + * buffer space. * - * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so - * will be at least a page and will therefore hold the xdr_buf head. + * This is called before the operation is processed, so can only provide + * an upper estimate. For some nonidempotent operations (such as + * getattr), it's not necessarily a problem if that estimate is wrong, + * as we can fail it after processing without significant side effects. */ -__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) +__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) { - struct xdr_buf *xb = &resp->rqstp->rq_res; - struct nfsd4_session *session = NULL; + struct xdr_buf *buf = &resp->rqstp->rq_res; struct nfsd4_slot *slot = resp->cstate.slot; - u32 length, tlen = 0; + if (buf->len + respsize <= buf->buflen) + return nfs_ok; if (!nfsd4_has_session(&resp->cstate)) - return 0; - - session = resp->cstate.session; - - if (xb->page_len == 0) { - length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; - } else { - if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) - tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; - - length = xb->head[0].iov_len + xb->page_len + tlen + pad; - } - dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, - length, xb->page_len, tlen, pad); - - if (length > session->se_fchannel.maxresp_sz) - return nfserr_rep_too_big; - - if ((slot->sl_flags & NFSD4_SLOT_CACHETHIS) && - length > session->se_fchannel.maxresp_cached) + return nfserr_resource; + if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) { + WARN_ON_ONCE(1); return nfserr_rep_too_big_to_cache; - - return 0; + } + return nfserr_rep_too_big; } void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { + struct xdr_stream *xdr = &resp->xdr; struct nfs4_stateowner *so = resp->cstate.replay_owner; - __be32 *statp; + struct svc_rqst *rqstp = resp->rqstp; + int post_err_offset; + nfsd4_enc encoder; __be32 *p; - RESERVE_SPACE(8); - WRITE32(op->opnum); - statp = p++; /* to be backfilled at the end */ - ADJUST_ARGS(); + p = xdr_reserve_space(xdr, 8); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); + post_err_offset = xdr->buf->len; if (op->opnum == OP_ILLEGAL) goto status; BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); - op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); + encoder = nfsd4_enc_ops[op->opnum]; + op->status = encoder(resp, op->status, &op->u); + xdr_commit_encode(xdr); + /* nfsd4_check_resp_size guarantees enough room for error status */ - if (!op->status) - op->status = nfsd4_check_resp_size(resp, 0); + if (!op->status) { + int space_needed = 0; + if (!nfsd4_last_compound_op(rqstp)) + space_needed = COMPOUND_ERR_SLACK_SPACE; + op->status = nfsd4_check_resp_size(resp, space_needed); + } if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) { struct nfsd4_slot *slot = resp->cstate.slot; @@ -3635,17 +3906,30 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) else op->status = nfserr_rep_too_big; } + if (op->status == nfserr_resource || + op->status == nfserr_rep_too_big || + op->status == nfserr_rep_too_big_to_cache) { + /* + * The operation may have already been encoded or + * partially encoded. No op returns anything additional + * in the case of one of these three errors, so we can + * just truncate back to after the status. But it's a + * bug if we had to do this on a non-idempotent op: + */ + warn_on_nonidempotent_op(op); + xdr_truncate_encode(xdr, post_err_offset); + } if (so) { + int len = xdr->buf->len - post_err_offset; + so->so_replay.rp_status = op->status; - so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1); - memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen); + so->so_replay.rp_buflen = len; + read_bytes_from_xdr_buf(xdr->buf, post_err_offset, + so->so_replay.rp_buf, len); } status: - /* - * Note: We write the status directly, instead of using WRITE32(), - * since it is already in network byte order. - */ - *statp = op->status; + /* Note that op->status is already in network byte order: */ + write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4); } /* @@ -3657,21 +3941,22 @@ status: * called with nfs4_lock_state() held */ void -nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op) +nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) { __be32 *p; struct nfs4_replay *rp = op->replay; BUG_ON(!rp); - RESERVE_SPACE(8); - WRITE32(op->opnum); + p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); + if (!p) { + WARN_ON_ONCE(1); + return; + } + *p++ = cpu_to_be32(op->opnum); *p++ = rp->rp_status; /* already xdr'ed */ - ADJUST_ARGS(); - RESERVE_SPACE(rp->rp_buflen); - WRITEMEM(rp->rp_buf, rp->rp_buflen); - ADJUST_ARGS(); + p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); } int @@ -3728,19 +4013,19 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo * All that remains is to write the tag and operation count... */ struct nfsd4_compound_state *cs = &resp->cstate; - struct kvec *iov; + struct xdr_buf *buf = resp->xdr.buf; + + WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + + buf->tail[0].iov_len); + + rqstp->rq_next_page = resp->xdr.page_ptr + 1; + p = resp->tagp; *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); p += XDR_QUADLEN(resp->taglen); *p++ = htonl(resp->opcnt); - if (rqstp->rq_res.page_len) - iov = &rqstp->rq_res.tail[0]; - else - iov = &rqstp->rq_res.head[0]; - iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; - BUG_ON(iov->iov_len > PAGE_SIZE); if (nfsd4_has_session(cs)) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfs4_client *clp = cs->session->se_client; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index f8f060ffbf4f..6040da8830ff 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -224,13 +224,6 @@ hash_refile(struct svc_cacherep *rp) hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); } -static inline bool -nfsd_cache_entry_expired(struct svc_cacherep *rp) -{ - return rp->c_state != RC_INPROG && - time_after(jiffies, rp->c_timestamp + RC_EXPIRE); -} - /* * Walk the LRU list and prune off entries that are older than RC_EXPIRE. * Also prune the oldest ones when the total exceeds the max number of entries. @@ -242,8 +235,14 @@ prune_cache_entries(void) long freed = 0; list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { - if (!nfsd_cache_entry_expired(rp) && - num_drc_entries <= max_drc_entries) + /* + * Don't free entries attached to calls that are still + * in-progress, but do keep scanning the list. + */ + if (rp->c_state == RC_INPROG) + continue; + if (num_drc_entries <= max_drc_entries && + time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; nfsd_reply_cache_free_locked(rp); freed++; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f34d9de802ab..51844048937f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1179,7 +1179,6 @@ static int __init init_nfsd(void) retval = nfsd4_init_slabs(); if (retval) goto out_unregister_pernet; - nfs4_state_init(); retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ if (retval) goto out_free_slabs; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 479eb681c27c..847daf37e566 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -15,11 +15,20 @@ #include <linux/nfs2.h> #include <linux/nfs3.h> #include <linux/nfs4.h> +#include <linux/sunrpc/svc.h> #include <linux/sunrpc/msg_prot.h> -#include <linux/nfsd/debug.h> -#include <linux/nfsd/export.h> -#include <linux/nfsd/stats.h> +#include <uapi/linux/nfsd/debug.h> + +#include "stats.h" +#include "export.h" + +#undef ifdebug +#ifdef NFSD_DEBUG +# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) +#else +# define ifdebug(flag) if (0) +#endif /* * nfsd version @@ -106,7 +115,6 @@ static inline int nfsd_v4client(struct svc_rqst *rq) */ #ifdef CONFIG_NFSD_V4 extern unsigned long max_delegations; -void nfs4_state_init(void); int nfsd4_init_slabs(void); void nfsd4_free_slabs(void); int nfs4_state_start(void); @@ -117,7 +125,6 @@ void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); #else -static inline void nfs4_state_init(void) { } static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } static inline int nfs4_state_start(void) { return 0; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 3c37b160dcad..ec8393418154 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -88,9 +88,8 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, /* Check if the request originated from a secure port. */ if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - dprintk(KERN_WARNING - "nfsd: request from insecure port %s!\n", - svc_print_addr(rqstp, buf, sizeof(buf))); + dprintk("nfsd: request from insecure port %s!\n", + svc_print_addr(rqstp, buf, sizeof(buf))); return nfserr_perm; } @@ -169,8 +168,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) data_left -= len; if (data_left < 0) return error; - exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); - fid = (struct fid *)(fh->fh_auth + len); + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid); + fid = (struct fid *)(fh->fh_fsid + len); } else { __u32 tfh[2]; dev_t xdev; @@ -385,7 +384,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, { if (dentry != exp->ex_path.dentry) { struct fid *fid = (struct fid *) - (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1); + (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); @@ -513,7 +512,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, */ struct inode * inode = dentry->d_inode; - __u32 *datap; dev_t ex_dev = exp_sb(exp)->s_dev; dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", @@ -557,17 +555,16 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, if (inode) _fh_update_old(dentry, exp, &fhp->fh_handle); } else { - int len; + fhp->fh_handle.fh_size = + key_len(fhp->fh_handle.fh_fsid_type) + 4; fhp->fh_handle.fh_auth_type = 0; - datap = fhp->fh_handle.fh_auth+0; - mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev, + + mk_fsid(fhp->fh_handle.fh_fsid_type, + fhp->fh_handle.fh_fsid, + ex_dev, exp->ex_path.dentry->d_inode->i_ino, exp->ex_fsid, exp->ex_uuid); - len = key_len(fhp->fh_handle.fh_fsid_type); - datap += len/4; - fhp->fh_handle.fh_size = 4 + len; - if (inode) _fh_update(fhp, exp, dentry); if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index ad67964d0bb1..2e89e70ac15c 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -1,9 +1,58 @@ -/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ +/* + * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> + * + * This file describes the layout of the file handles as passed + * over the wire. + */ +#ifndef _LINUX_NFSD_NFSFH_H +#define _LINUX_NFSD_NFSFH_H + +#include <linux/sunrpc/svc.h> +#include <uapi/linux/nfsd/nfsfh.h> + +static inline __u32 ino_t_to_u32(ino_t ino) +{ + return (__u32) ino; +} + +static inline ino_t u32_to_ino_t(__u32 uino) +{ + return (ino_t) uino; +} -#ifndef _LINUX_NFSD_FH_INT_H -#define _LINUX_NFSD_FH_INT_H +/* + * This is the internal representation of an NFS handle used in knfsd. + * pre_mtime/post_version will be used to support wcc_attr's in NFSv3. + */ +typedef struct svc_fh { + struct knfsd_fh fh_handle; /* FH data */ + struct dentry * fh_dentry; /* validated dentry */ + struct svc_export * fh_export; /* export pointer */ + int fh_maxsize; /* max size for fh_handle */ + + unsigned char fh_locked; /* inode locked by us */ + unsigned char fh_want_write; /* remount protection taken */ + +#ifdef CONFIG_NFSD_V3 + unsigned char fh_post_saved; /* post-op attrs saved */ + unsigned char fh_pre_saved; /* pre-op attrs saved */ + + /* Pre-op attributes saved during fh_lock */ + __u64 fh_pre_size; /* size before operation */ + struct timespec fh_pre_mtime; /* mtime before oper */ + struct timespec fh_pre_ctime; /* ctime before oper */ + /* + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode) + * to find out if it is valid. + */ + u64 fh_pre_change; + + /* Post-op attributes saved in fh_unlock */ + struct kstat fh_post_attr; /* full attrs after operation */ + u64 fh_post_change; /* nfsv4 change; see above */ +#endif /* CONFIG_NFSD_V3 */ -#include <linux/nfsd/nfsfh.h> +} svc_fh; enum nfsd_fsid { FSID_DEV = 0, @@ -215,4 +264,4 @@ fh_unlock(struct svc_fh *fhp) } } -#endif /* _LINUX_NFSD_FH_INT_H */ +#endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9a4a5f9e7468..1879e43f2868 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -591,12 +591,6 @@ nfsd(void *vrqstp) nfsdstats.th_cnt++; mutex_unlock(&nfsd_mutex); - /* - * We want less throttling in balance_dirty_pages() so that nfs to - * localhost doesn't cause nfsd to lock up due to all the client's - * dirty pages. - */ - current->flags |= PF_LESS_THROTTLE; set_freezable(); /* diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 9c769a47ac5a..1ac306b769df 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -214,7 +214,8 @@ nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) int nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; return xdr_argsize_check(rqstp, p); } @@ -248,7 +249,8 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len; int v; - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->offset = ntohl(*p++); @@ -281,7 +283,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, unsigned int len, hdr, dlen; int v; - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; p++; /* beginoffset */ @@ -355,7 +358,8 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p, int nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); @@ -391,7 +395,8 @@ int nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readdirargs *args) { - if (!(p = decode_fh(p, &args->fh))) + p = decode_fh(p, &args->fh); + if (!p) return 0; args->cookie = ntohl(*p++); args->count = ntohl(*p++); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 424d8f5f2317..374c66283ac5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -37,7 +37,6 @@ #include <linux/idr.h> #include <linux/sunrpc/svc_xprt.h> -#include <linux/nfsd/nfsfh.h> #include "nfsfh.h" typedef struct { @@ -123,7 +122,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of operations per session compound */ #define NFSD_MAX_OPS_PER_COMPOUND 16 /* Maximum session per slot cache size */ -#define NFSD_SLOT_CACHE_SIZE 1024 +#define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ #define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32 #define NFSD_MAX_MEM_PER_SESSION \ @@ -464,8 +463,6 @@ extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn); extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn); -extern void nfs4_free_openowner(struct nfs4_openowner *); -extern void nfs4_free_lockowner(struct nfs4_lockowner *); extern int set_callback_cred(void); extern void nfsd4_init_callback(struct nfsd4_callback *); extern void nfsd4_probe_callback(struct nfs4_client *clp); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 6d4521feb6e3..cd90878a76aa 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -24,7 +24,6 @@ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/sunrpc/stats.h> -#include <linux/nfsd/stats.h> #include <net/net_namespace.h> #include "nfsd.h" diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h new file mode 100644 index 000000000000..a5c944b771c6 --- /dev/null +++ b/fs/nfsd/stats.h @@ -0,0 +1,43 @@ +/* + * Statistics for NFS server. + * + * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + */ +#ifndef _NFSD_STATS_H +#define _NFSD_STATS_H + +#include <uapi/linux/nfsd/stats.h> + + +struct nfsd_stats { + unsigned int rchits; /* repcache hits */ + unsigned int rcmisses; /* repcache hits */ + unsigned int rcnocache; /* uncached reqs */ + unsigned int fh_stale; /* FH stale error */ + unsigned int fh_lookup; /* dentry cached */ + unsigned int fh_anon; /* anon file dentry returned */ + unsigned int fh_nocache_dir; /* filehandle not found in dcache */ + unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ + unsigned int io_read; /* bytes returned to read requests */ + unsigned int io_write; /* bytes passed in write requests */ + unsigned int th_cnt; /* number of available threads */ + unsigned int th_usage[10]; /* number of ticks during which n perdeciles + * of available threads were in use */ + unsigned int th_fullcnt; /* number of times last free thread was used */ + unsigned int ra_size; /* size of ra cache */ + unsigned int ra_depth[11]; /* number of times ra entry was found that deep + * in the cache (10percentiles). [10] = not found */ +#ifdef CONFIG_NFSD_V4 + unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ +#endif + +}; + + +extern struct nfsd_stats nfsdstats; +extern struct svc_stat nfsd_svcstats; + +void nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +#endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 16f0673a423c..140c496f612c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -820,55 +820,54 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } -static __be32 -nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) { - mm_segment_t oldfs; - __be32 err; - int host_err; - - err = nfserr_perm; - - if (file->f_op->splice_read && rqstp->rq_splice_ok) { - struct splice_desc sd = { - .len = 0, - .total_len = *count, - .pos = offset, - .u.data = rqstp, - }; - - rqstp->rq_next_page = rqstp->rq_respages + 1; - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - } else { - oldfs = get_fs(); - set_fs(KERNEL_DS); - host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); - set_fs(oldfs); - } - if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; - err = 0; fsnotify_access(file); + return 0; } else - err = nfserrno(host_err); - return err; + return nfserrno(host_err); +} + +int nfsd_splice_read(struct svc_rqst *rqstp, + struct file *file, loff_t offset, unsigned long *count) +{ + struct splice_desc sd = { + .len = 0, + .total_len = *count, + .pos = offset, + .u.data = rqstp, + }; + int host_err; + + rqstp->rq_next_page = rqstp->rq_respages + 1; + host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + return nfsd_finish_read(file, count, host_err); } -static void kill_suid(struct dentry *dentry) +int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, + unsigned long *count) { - struct iattr ia; - ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; + mm_segment_t oldfs; + int host_err; - mutex_lock(&dentry->d_inode->i_mutex); - /* - * Note we call this on write, so notify_change will not - * encounter any conflicting delegations: - */ - notify_change(dentry, &ia, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + oldfs = get_fs(); + set_fs(KERNEL_DS); + host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); + set_fs(oldfs); + return nfsd_finish_read(file, count, host_err); +} + +static __be32 +nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + if (file->f_op->splice_read && rqstp->rq_splice_ok) + return nfsd_splice_read(rqstp, file, offset, count); + else + return nfsd_readv(file, offset, vec, vlen, count); } /* @@ -922,6 +921,16 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; loff_t pos = offset; + unsigned int pflags = current->flags; + + if (rqstp->rq_local) + /* + * We want less throttling in balance_dirty_pages() + * and shrink_inactive_list() so that nfs to + * localhost doesn't cause nfsd to lock up due to all + * the client's dirty pages or its congested queue. + */ + current->flags |= PF_LESS_THROTTLE; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -942,10 +951,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += host_err; fsnotify_modify(file); - /* clear setuid/setgid flag after write */ - if (inode->i_mode & (S_ISUID | S_ISGID)) - kill_suid(dentry); - if (stable) { if (use_wgather) host_err = wait_for_concurrent_writes(file); @@ -959,36 +964,33 @@ out_nfserr: err = 0; else err = nfserrno(host_err); + if (rqstp->rq_local) + tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); return err; } -/* - * Read data from a file. count must contain the requested read count - * on entry. On return, *count contains the number of bytes actually read. - * N.B. After this call fhp needs an fh_put - */ -__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file **file, struct raparms **ra) { - struct file *file; struct inode *inode; - struct raparms *ra; __be32 err; - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file); if (err) return err; - inode = file_inode(file); + inode = file_inode(*file); /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - if (ra && ra->p_set) - file->f_ra = ra->p_ra; - - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + if (*ra && (*ra)->p_set) + (*file)->f_ra = (*ra)->p_ra; + return nfs_ok; +} +void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra) +{ /* Write back readahead params */ if (ra) { struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; @@ -998,28 +1000,29 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ra->p_count--; spin_unlock(&rab->pb_lock); } - nfsd_close(file); - return err; } -/* As above, but use the provided file descriptor. */ -__be32 -nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, - unsigned long *count) +/* + * Read data from a file. count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - __be32 err; + struct file *file; + struct raparms *ra; + __be32 err; + + err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra); + if (err) + return err; + + err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + + nfsd_put_tmp_read_open(file, ra); - if (file) { - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, - NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - } else /* Note file may still be NULL in NFSv4 special stateid case: */ - err = nfsd_read(rqstp, fhp, offset, vec, vlen, count); -out: return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fbe90bdb2214..91b6ae3f658b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -70,10 +70,16 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); void nfsd_close(struct file *); +struct raparms; +__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, + struct file **, struct raparms **); +void nfsd_put_tmp_read_open(struct file *, struct raparms *); +int nfsd_splice_read(struct svc_rqst *, + struct file *, loff_t, unsigned long *); +int nfsd_readv(struct file *, loff_t, struct kvec *, int, + unsigned long *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); -__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *, - loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, loff_t, struct kvec *,int, unsigned long *, int *); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 5ea7df305083..18cbb6d9c8a9 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -58,7 +58,7 @@ struct nfsd4_compound_state { /* For sessions DRC */ struct nfsd4_session *session; struct nfsd4_slot *slot; - __be32 *datap; + int data_offset; size_t iovlen; u32 minorversion; __be32 status; @@ -287,9 +287,8 @@ struct nfsd4_readdir { struct svc_fh * rd_fhp; /* response */ struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; + struct xdr_stream *xdr; + int cookie_offset; }; struct nfsd4_release_lockowner { @@ -506,9 +505,7 @@ struct nfsd4_compoundargs { struct nfsd4_compoundres { /* scratch variables for XDR encode */ - __be32 * p; - __be32 * end; - struct xdr_buf * xbuf; + struct xdr_stream xdr; struct svc_rqst * rqstp; u32 taglen; @@ -538,6 +535,9 @@ static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op); +void warn_on_nonidempotent_op(struct nfsd4_op *op); + #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) static inline void @@ -563,10 +563,11 @@ int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *, struct nfsd4_compoundres *); __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32); void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); -void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); -__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, - struct dentry *dentry, __be32 **buffer, int countp, - u32 *bmval, struct svc_rqst *, int ignore_crossmnt); +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op); +__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid *setclid); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index f3a82fbcae02..24978153c0c4 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -152,10 +152,10 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) */ const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b9c5726120e3..6252b173a465 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -298,19 +298,20 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, } static ssize_t -nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t size; if (rw == WRITE) return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + size = blockdev_direct_IO(rw, iocb, inode, iter, offset, nilfs_get_block); /* @@ -319,7 +320,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, */ if (unlikely((rw & WRITE) && size < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if (end > isize) nilfs_write_failed(mapping, end); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e565c814309..3fdc8a3e1134 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -25,6 +25,19 @@ #define FANOTIFY_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 +/* + * All flags that may be specified in parameter event_f_flags of fanotify_init. + * + * Internal and external open flags are stored together in field f_flags of + * struct file. Only external open flags shall be allowed in event_f_flags. + * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be + * excluded. + */ +#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ + O_ACCMODE | O_APPEND | O_NONBLOCK | \ + __O_SYNC | O_DSYNC | O_CLOEXEC | \ + O_LARGEFILE | O_NOATIME ) + extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; @@ -669,6 +682,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) + return -EINVAL; + + switch (event_f_flags & O_ACCMODE) { + case O_RDONLY: + case O_RDWR: + case O_WRONLY: + break; + default: + return -EINVAL; + } + user = get_current_user(); if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { free_uid(user); @@ -698,6 +723,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->overflow_event = &oevent->fse; + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS spin_lock_init(&group->fanotify_data.access_lock); @@ -774,7 +801,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, case FAN_MARK_REMOVE: if (!mask) return -EINVAL; + break; case FAN_MARK_FLUSH: + if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) + return -EINVAL; break; default: return -EINVAL; @@ -811,6 +841,15 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, group->priority == FS_PRIO_0) goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { + ret = 0; + if (flags & FAN_MARK_MOUNT) + fsnotify_clear_vfsmount_marks_by_group(group); + else + fsnotify_clear_inode_marks_by_group(group); + goto fput_and_out; + } + ret = fanotify_find_path(dfd, pathname, &path, flags); if (ret) goto fput_and_out; @@ -822,7 +861,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, mnt = path.mnt; /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: if (flags & FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); @@ -835,12 +874,6 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, else ret = fanotify_remove_inode_mark(group, inode, mask, flags); break; - case FAN_MARK_FLUSH: - if (flags & FAN_MARK_MOUNT) - fsnotify_clear_vfsmount_marks_by_group(group); - else - fsnotify_clear_inode_marks_by_group(group); - break; default: ret = -EINVAL; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 78a2ca3966c3..cc423a30a0c8 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -57,7 +57,7 @@ static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; static int zero; -ctl_table inotify_table[] = { +struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &inotify_max_user_instances, diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 923fe4a5f503..d90deaa08e78 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -340,7 +340,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, static int fsnotify_mark_destroy(void *ignored) { struct fsnotify_mark *mark, *next; - LIST_HEAD(private_destroy_list); + struct list_head private_destroy_list; for (;;) { spin_lock(&destroy_lock); diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index a27e3fecefaf..250ed5b20c8f 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) if (page) { set_page_dirty(page); unlock_page(page); - mark_page_accessed(page); page_cache_release(page); } ntfs_debug("Done."); diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index ee4144ce5d7c..f82498c35e78 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -58,7 +58,7 @@ typedef enum { /** * ntfs_compression_buffer - one buffer for the decompression engine */ -static u8 *ntfs_compression_buffer = NULL; +static u8 *ntfs_compression_buffer; /** * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index db9bd8a31725..5c9e2c81cb11 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, } do { unlock_page(pages[--do_pages]); - mark_page_accessed(pages[do_pages]); page_cache_release(pages[do_pages]); } while (do_pages); if (unlikely(status)) @@ -2091,10 +2090,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, size_t count; /* after file limit checks */ ssize_t written, err; - count = 0; - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); - if (err) - return err; + count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ current->backing_dev_info = mapping->backing_dev_info; @@ -2203,8 +2199,8 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, const struct file_operations ntfs_file_ops = { .llseek = generic_file_llseek, /* Seek inside file. */ - .read = do_sync_read, /* Read from file. */ - .aio_read = generic_file_aio_read, /* Async read from file. */ + .read = new_sync_read, /* Read from file. */ + .read_iter = generic_file_read_iter, /* Async read from file. */ #ifdef NTFS_RW .write = do_sync_write, /* Write to file. */ .aio_write = ntfs_file_aio_write, /* Async write to file. */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 9de2491f2926..6c3296e546c3 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -50,8 +50,8 @@ static unsigned long ntfs_nr_compression_users; /* A global default upcase table and a corresponding reference count. */ -static ntfschar *default_upcase = NULL; -static unsigned long ntfs_nr_upcase_users = 0; +static ntfschar *default_upcase; +static unsigned long ntfs_nr_upcase_users; /* Error constants/strings used in inode.c::ntfs_show_options(). */ typedef enum { diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 79a89184cb5e..a503156ec15f 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -34,7 +34,7 @@ #include "debug.h" /* Definition of the ntfs sysctl. */ -static ctl_table ntfs_sysctls[] = { +static struct ctl_table ntfs_sysctls[] = { { .procname = "ntfs-debug", .data = &debug_msgs, /* Data pointer and size. */ @@ -46,7 +46,7 @@ static ctl_table ntfs_sysctls[] = { }; /* Define the parent directory /proc/sys/fs. */ -static ctl_table sysctls_root[] = { +static struct ctl_table sysctls_root[] = { { .procname = "fs", .mode = 0555, @@ -56,7 +56,7 @@ static ctl_table sysctls_root[] = { }; /* Storage for the sysctls header. */ -static struct ctl_table_header *sysctls_root_table = NULL; +static struct ctl_table_header *sysctls_root_table; /** * ntfs_sysctl - add or remove the debug sysctl diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b4deb5f750d9..9d8fcf2f3b94 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6046,7 +6046,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, int cancel) { - if (osb->osb_tl_inode) { + if (osb->osb_tl_inode && + atomic_read(&osb->osb_tl_disable) == 0) { /* We want to push off log flushes while truncates are * still running. */ if (cancel) @@ -6223,6 +6224,8 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; + atomic_set(&osb->osb_tl_disable, 1); + if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); flush_workqueue(ocfs2_wq); @@ -6254,6 +6257,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * until we're sure all is well. */ INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker); + atomic_set(&osb->osb_tl_disable, 0); osb->osb_tl_bh = tl_bh; osb->osb_tl_inode = tl_inode; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d310d12a9adc..4a231a166cf8 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -599,9 +599,8 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; @@ -618,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw, return 0; return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, + iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bf482dfed14f..73039295d0d1 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data) mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); /* Pin node */ o2nm_depend_this_node(); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c6b90e670389..681691bc233a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; /* XXX someday we'll need better accounting */ -static struct socket *o2net_listen_sock = NULL; +static struct socket *o2net_listen_sock; /* * listen work is only queued by the listening socket callbacks on the @@ -1799,7 +1799,7 @@ int o2net_register_hb_callbacks(void) /* ------------------------------------------------------------ */ -static int o2net_accept_one(struct socket *sock) +static int o2net_accept_one(struct socket *sock, int *more) { int ret, slen; struct sockaddr_in sin; @@ -1810,6 +1810,7 @@ static int o2net_accept_one(struct socket *sock) struct o2net_node *nn; BUG_ON(sock == NULL); + *more = 0; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); if (ret) @@ -1821,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock) if (ret < 0) goto out; + *more = 1; new_sock->sk->sk_allocation = GFP_ATOMIC; ret = o2net_set_nodelay(new_sock); @@ -1919,11 +1921,36 @@ out: return ret; } +/* + * This function is invoked in response to one or more + * pending accepts at softIRQ level. We must drain the + * entire que before returning. + */ + static void o2net_accept_many(struct work_struct *work) { struct socket *sock = o2net_listen_sock; - while (o2net_accept_one(sock) == 0) + int more; + int err; + + /* + * It is critical to note that due to interrupt moderation + * at the network driver level, we can't assume to get a + * softIRQ for every single conn since tcp SYN packets + * can arrive back-to-back, and therefore many pending + * accepts may result in just 1 softIRQ. If we terminate + * the o2net_accept_one() loop upon seeing an err, what happens + * to the rest of the conns in the queue? If no new SYN + * arrives for hours, no softIRQ will be delivered, + * and the connections will just sit in the queue. + */ + + for (;;) { + err = o2net_accept_one(sock, &more); + if (!more) + break; cond_resched(); + } } static void o2net_listen_data_ready(struct sock *sk) diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e0517762fcc0..fae17c640df3 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) struct dlm_recovery_ctxt { struct list_head resources; - struct list_head received; struct list_head node_data; u8 new_master; u8 dead_node; @@ -332,6 +331,7 @@ struct dlm_lock_resource u16 state; char lvb[DLM_LVB_LEN]; unsigned int inflight_locks; + unsigned int inflight_assert_workers; unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; @@ -911,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e33cd7a3c582..18f13c2e4a10 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) #ifdef CONFIG_DEBUG_FS -static struct dentry *dlm_debugfs_root = NULL; +static struct dentry *dlm_debugfs_root; #define DLM_DEBUGFS_DIR "o2dlm" #define DLM_DEBUGFS_DLM_STATE "dlm_state" diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c973690dc0bc..39efc5057a36 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -959,6 +959,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, * domain. Set him in the map and clean up our * leftover join state. */ BUG_ON(dlm->joining_node != assert->node_idx); + + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "dlm recovery is ongoing, disallow join\n"); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + return -EAGAIN; + } + set_bit(assert->node_idx, dlm->domain_map); clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); @@ -1517,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, unsigned int node) { int status; + int ret; struct dlm_assert_joined assert_msg; mlog(0, "Sending join assert to node %u\n", node); @@ -1528,11 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, &assert_msg, sizeof(assert_msg), node, - NULL); + &ret); if (status < 0) mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, node); + else + status = ret; return status; } @@ -2023,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->list); INIT_LIST_HEAD(&dlm->dirty_list); INIT_LIST_HEAD(&dlm->reco.resources); - INIT_LIST_HEAD(&dlm->reco.received); INIT_LIST_HEAD(&dlm->reco.node_data); INIT_LIST_HEAD(&dlm->purge_list); INIT_LIST_HEAD(&dlm->dlm_domain_handlers); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 5d32f7511f74..66c2a491f68d 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -52,7 +52,7 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" -static struct kmem_cache *dlm_lock_cache = NULL; +static struct kmem_cache *dlm_lock_cache; static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index af3f7aa73e13..82abf0cc9a12 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -static struct kmem_cache *dlm_lockres_cache = NULL; -static struct kmem_cache *dlm_lockname_cache = NULL; -static struct kmem_cache *dlm_mle_cache = NULL; +static struct kmem_cache *dlm_lockres_cache; +static struct kmem_cache *dlm_lockname_cache; +static struct kmem_cache *dlm_mle_cache; static void dlm_mle_release(struct kref *kref); static void dlm_init_mle(struct dlm_master_list_entry *mle, @@ -472,11 +472,15 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) + if (dlm_lockname_cache) { kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; + } - if (dlm_lockres_cache) + if (dlm_lockres_cache) { kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; + } } static void dlm_lockres_release(struct kref *kref) @@ -577,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; res->inflight_locks = 0; + res->inflight_assert_workers = 0; res->dlm = dlm; @@ -679,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, wake_up(&res->wq); } +void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + res->inflight_assert_workers++; + mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + +static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + BUG_ON(res->inflight_assert_workers == 0); + res->inflight_assert_workers--; + mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_assert_workers); +} + +static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + spin_lock(&res->spinlock); + __dlm_lockres_drop_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -1599,7 +1641,8 @@ send_response: mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); - } + } else + dlm_lockres_grab_inflight_worker(dlm, res); } else { if (res) dlm_lockres_put(res); @@ -2114,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) dlm_lockres_release_ast(dlm, res); put: + dlm_lockres_drop_inflight_worker(dlm, res); + dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -3084,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it so that only one mle will be found */ __dlm_unlink_mle(dlm, tmp); __dlm_mle_detach_hb_events(dlm, tmp); - ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; - mlog(0, "%s:%.*s: master=%u, newmaster=%u, " - "telling master to get ref for cleared out mle " - "during migration\n", dlm->name, namelen, name, - master, new_master); + if (tmp->type == DLM_MLE_MASTER) { + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref " + "for cleared out mle during " + "migration\n", dlm->name, + namelen, name, master, + new_master); + } } spin_unlock(&tmp->spinlock); } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index fe29f7978f81..45067faf5695 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1708,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); - } + } else + __dlm_lockres_grab_inflight_worker(dlm, res); } else /* put.. incase we are not the master */ dlm_lockres_put(res); spin_unlock(&res->spinlock); @@ -1986,7 +1987,15 @@ skip_lvb: } if (!bad) { dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + if (mres->flags & DLM_MRES_RECOVERY && + ml->list == DLM_CONVERTING_LIST && + newlock->ml.type > + newlock->ml.convert_type) { + /* newlock is doing downconvert, add it to the + * head of converting list */ + list_add(&newlock->list, queue); + } else + list_add_tail(&newlock->list, queue); mlog(0, "%s:%.*s: added lock for node %u, " "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 9db869de829d..69aac6f088ad 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * refs on it. */ unused = __dlm_lockres_unused(lockres); if (!unused || - (lockres->state & DLM_LOCK_RES_MIGRATING)) { + (lockres->state & DLM_LOCK_RES_MIGRATING) || + (lockres->inflight_assert_workers != 0)) { mlog(0, "%s: res %.*s is in use or being remastered, " - "used %d, state %d\n", dlm->name, - lockres->lockname.len, lockres->lockname.name, - !unused, lockres->state); - list_move_tail(&dlm->purge_list, &lockres->purge); + "used %d, state %d, assert master workers %u\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, + !unused, lockres->state, + lockres->inflight_assert_workers); + list_move_tail(&lockres->purge, &dlm->purge_list); spin_unlock(&lockres->spinlock); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 5698b52cf5c9..2e3c9dbab68c 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR + ) { /* must clear the actions because this unlock * is about to be retried. cannot free or do * any list manipulation. */ @@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, res->lockname.name, status==DLM_RECOVERING?"recovering": (status==DLM_MIGRATING?"migrating": - "forward")); + (status == DLM_FORWARD ? "forward" : + "nolockmanager"))); actions = 0; } if (flags & LKM_CANCEL) @@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, * updated state to the recovery master. this thread * just needs to finish out the operation and call * the unlockast. */ - ret = DLM_NORMAL; + if (dlm_is_node_dead(dlm, owner)) + ret = DLM_NORMAL; + else + ret = DLM_NOLOCKMGR; } else { /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); @@ -638,7 +644,9 @@ retry: if (status == DLM_RECOVERING || status == DLM_MIGRATING || - status == DLM_FORWARD) { + status == DLM_FORWARD || + status == DLM_NOLOCKMGR) { + /* We want to go away for a tiny bit to allow recovery * / migration to complete on this resource. I don't * know of any wait queue we could sleep on as this @@ -650,7 +658,7 @@ retry: msleep(50); mlog(0, "retrying unlock due to pending recovery/" - "migration/in-progress\n"); + "migration/in-progress/reconnect\n"); goto retry; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6bd690b5a061..52cfe99ae056 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2544,11 +2544,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, * refreshed, so we do it here. Of course, making sense of * everything is up to the caller :) */ status = ocfs2_should_refresh_lock_res(lockres); - if (status < 0) { - ocfs2_cluster_unlock(osb, lockres, level); - mlog_errno(status); - goto bail; - } if (status) { status = ocfs2_refresh_slot_info(osb); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8970dcf74de5..2930e231f3f9 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -828,7 +828,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* * fs-writeback will release the dirty pages without page lock * whose offset are over inode size, the release happens at - * block_write_full_page_endio(). + * block_write_full_page(). */ i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); @@ -2233,16 +2233,13 @@ out: return ret; } -static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { int ret, direct_io, appending, rw_level, have_alloc_sem = 0; int can_do_direct, has_refcount = 0; ssize_t written = 0; - size_t ocount; /* original count */ - size_t count; /* after file limit checks */ + size_t count = iov_iter_count(from); loff_t old_size, *ppos = &iocb->ki_pos; u32 old_clusters; struct file *file = iocb->ki_filp; @@ -2256,7 +2253,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, (unsigned long long)OCFS2_I(inode)->ip_blkno, file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name, - (unsigned int)nr_segs); + (unsigned int)from->nr_segs); /* GRRRRR */ if (iocb->ki_nbytes == 0) return 0; @@ -2354,29 +2351,21 @@ relock: /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - ret = generic_segment_checks(iov, &nr_segs, &ocount, - VERIFY_READ); - if (ret) - goto out_dio; - - count = ocount; ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) goto out_dio; + iov_iter_truncate(from, count); if (direct_io) { - written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - count, ocount); + written = generic_file_direct_write(iocb, from, *ppos); if (written < 0) { ret = written; goto out_dio; } } else { - struct iov_iter from; - iov_iter_init(&from, iov, nr_segs, count, 0); current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_perform_write(file, &from, *ppos); + written = generic_perform_write(file, from, *ppos); if (likely(written >= 0)) iocb->ki_pos = *ppos + written; current->backing_dev_info = NULL; @@ -2441,84 +2430,6 @@ out_sems: return ret; } -static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, - struct file *out, - struct splice_desc *sd) -{ - int ret; - - ret = ocfs2_prepare_inode_for_write(out, &sd->pos, - sd->total_len, 0, NULL, NULL); - if (ret < 0) { - mlog_errno(ret); - return ret; - } - - return splice_from_pipe_feed(pipe, sd, pipe_to_file); -} - -static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, - loff_t *ppos, - size_t len, - unsigned int flags) -{ - int ret; - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - - - trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - out->f_path.dentry->d_name.len, - out->f_path.dentry->d_name.name, len); - - pipe_lock(pipe); - - splice_from_pipe_begin(&sd); - do { - ret = splice_from_pipe_next(pipe, &sd); - if (ret <= 0) - break; - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) - mlog_errno(ret); - else { - ret = ocfs2_splice_to_file(pipe, out, &sd); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); - } while (ret > 0); - splice_from_pipe_end(pipe, &sd); - - pipe_unlock(pipe); - - if (sd.num_spliced) - ret = sd.num_spliced; - - if (ret > 0) { - int err; - - err = generic_write_sync(out, *ppos, ret); - if (err) - ret = err; - else - *ppos += ret; - - balance_dirty_pages_ratelimited(mapping); - } - - return ret; -} - static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, @@ -2534,7 +2445,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in, in->f_path.dentry->d_name.name, len); /* - * See the comment in ocfs2_file_aio_read() + * See the comment in ocfs2_file_read_iter() */ ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); if (ret < 0) { @@ -2549,10 +2460,8 @@ bail: return ret; } -static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) +static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) { int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; struct file *filp = iocb->ki_filp; @@ -2561,7 +2470,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, filp->f_path.dentry->d_name.len, - filp->f_path.dentry->d_name.name, nr_segs); + filp->f_path.dentry->d_name.name, + to->nr_segs); /* GRRRRR */ if (!inode) { @@ -2606,13 +2516,13 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } ocfs2_inode_unlock(inode, lock_level); - ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); + ret = generic_file_read_iter(iocb, to); trace_generic_file_aio_read_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); - /* see ocfs2_file_aio_write */ + /* see ocfs2_file_write_iter */ if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { rw_level = -1; have_alloc_sem = 0; @@ -2705,14 +2615,14 @@ const struct inode_operations ocfs2_special_file_iops = { */ const struct file_operations ocfs2_fops = { .llseek = ocfs2_file_llseek, - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, - .aio_read = ocfs2_file_aio_read, - .aio_write = ocfs2_file_aio_write, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, @@ -2720,7 +2630,7 @@ const struct file_operations ocfs2_fops = { .lock = ocfs2_lock, .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, - .splice_write = ocfs2_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; @@ -2753,21 +2663,21 @@ const struct file_operations ocfs2_dops = { */ const struct file_operations ocfs2_fops_no_plocks = { .llseek = ocfs2_file_llseek, - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, .open = ocfs2_file_open, - .aio_read = ocfs2_file_aio_read, - .aio_write = ocfs2_file_aio_write, + .read_iter = ocfs2_file_read_iter, + .write_iter = ocfs2_file_write_iter, .unlocked_ioctl = ocfs2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, - .splice_write = ocfs2_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 490229f43731..6f66b3751ace 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -143,8 +143,8 @@ bail: return status; } -int ocfs2_info_handle_blocksize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_blocksize oib; @@ -167,8 +167,8 @@ bail: return status; } -int ocfs2_info_handle_clustersize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_clustersize oic; @@ -192,8 +192,8 @@ bail: return status; } -int ocfs2_info_handle_maxslots(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_maxslots oim; @@ -217,8 +217,8 @@ bail: return status; } -int ocfs2_info_handle_label(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_label oil; @@ -242,8 +242,8 @@ bail: return status; } -int ocfs2_info_handle_uuid(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_uuid oiu; @@ -267,8 +267,8 @@ bail: return status; } -int ocfs2_info_handle_fs_features(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_fs_features oif; @@ -294,8 +294,8 @@ bail: return status; } -int ocfs2_info_handle_journal_size(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_journal_size oij; @@ -319,9 +319,10 @@ bail: return status; } -int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, - struct inode *inode_alloc, u64 blkno, - struct ocfs2_info_freeinode *fi, u32 slot) +static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, + u32 slot) { int status = 0, unlock = 0; @@ -366,8 +367,8 @@ bail: return status; } -int ocfs2_info_handle_freeinode(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freeinode(struct inode *inode, + struct ocfs2_info_request __user *req) { u32 i; u64 blkno = -1; @@ -462,19 +463,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, stats->ffs_free_chunks_real++; } -void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, - unsigned int chunksize) +static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) { o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); o2ffg_update_stats(&(ffg->iff_ffs), chunksize); } -int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, - struct inode *gb_inode, - struct ocfs2_dinode *gb_dinode, - struct ocfs2_chain_rec *rec, - struct ocfs2_info_freefrag *ffg, - u32 chunks_in_group) +static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) { int status = 0, used; u64 blkno; @@ -572,9 +573,9 @@ bail: return status; } -int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, - struct inode *gb_inode, u64 blkno, - struct ocfs2_info_freefrag *ffg) +static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) { u32 chunks_in_group; int status = 0, unlock = 0, i; @@ -652,8 +653,8 @@ bail: return status; } -int ocfs2_info_handle_freefrag(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) { u64 blkno = -1; char namebuf[40]; @@ -723,8 +724,8 @@ out_err: return status; } -int ocfs2_info_handle_unknown(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; @@ -752,8 +753,8 @@ bail: * - distinguish different requests. * - validate size of different requests. */ -int ocfs2_info_handle_request(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; @@ -811,8 +812,8 @@ bail: return status; } -int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, - u64 *req_addr, int compat_flag) +static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) { int status = -EFAULT; u64 __user *bp = NULL; @@ -849,8 +850,8 @@ bail: * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ -int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, - int compat_flag) +static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) { int i, status = 0; u64 req_addr; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 03ea9314fecd..4b0c68849b36 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -30,6 +30,7 @@ #include <linux/kthread.h> #include <linux/time.h> #include <linux/random.h> +#include <linux/delay.h> #include <cluster/masklog.h> @@ -2185,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg) || kthread_should_stop()); status = ocfs2_commit_cache(osb); - if (status < 0) - mlog_errno(status); + if (status < 0) { + static unsigned long abort_warn_time; + + /* Warn about this once per minute */ + if (printk_timed_ratelimit(&abort_warn_time, 60*HZ)) + mlog(ML_ERROR, "status = %d, journal is " + "already aborted.\n", status); + /* + * After ocfs2_commit_cache() fails, j_num_trans has a + * non-zero value. Sleep here to avoid a busy-wait + * loop. + */ + msleep_interruptible(1000); + } if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ mlog(ML_KTHREAD, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2060fc398445..8add6f1030d7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) return inode; } +static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, + struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + BUG_ON(dl->dl_count != 1); + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry_attach_lock); + kfree(dl); + iput(inode); +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, @@ -231,6 +246,7 @@ static int ocfs2_mknod(struct inode *dir, sigset_t oldset; int did_block_signals = 0; struct posix_acl *default_acl = NULL, *acl = NULL; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -423,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, &lookup); @@ -469,6 +487,9 @@ leave: * ocfs2_delete_inode will mutex_lock again. */ if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); @@ -991,6 +1012,65 @@ leave: return status; } +static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, + u64 src_inode_no, u64 dest_inode_no) +{ + int ret = 0, i = 0; + u64 parent_inode_no = 0; + u64 child_inode_no = src_inode_no; + struct inode *child_inode; + +#define MAX_LOOKUP_TIMES 32 + while (1) { + child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); + if (IS_ERR(child_inode)) { + ret = PTR_ERR(child_inode); + break; + } + + ret = ocfs2_inode_lock(child_inode, NULL, 0); + if (ret < 0) { + iput(child_inode); + if (ret != -ENOENT) + mlog_errno(ret); + break; + } + + ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, + &parent_inode_no); + ocfs2_inode_unlock(child_inode, 0); + iput(child_inode); + if (ret < 0) { + ret = -ENOENT; + break; + } + + if (parent_inode_no == dest_inode_no) { + ret = 1; + break; + } + + if (parent_inode_no == osb->root_inode->i_ino) { + ret = 0; + break; + } + + child_inode_no = parent_inode_no; + + if (++i >= MAX_LOOKUP_TIMES) { + mlog(ML_NOTICE, "max lookup times reached, filesystem " + "may have nested directories, " + "src inode: %llu, dest inode: %llu.\n", + (unsigned long long)src_inode_no, + (unsigned long long)dest_inode_no); + ret = 0; + break; + } + } + + return ret; +} + /* * The only place this should be used is rename! * if they have the same id, then the 1st one is the only one locked. @@ -1002,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, struct inode *inode2) { int status; + int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); struct buffer_head **tmpbh; @@ -1015,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, if (*bh2) *bh2 = NULL; - /* we always want to lock the one with the lower lockid first. */ + /* we always want to lock the one with the lower lockid first. + * and if they are nested, we lock ancestor first */ if (oi1->ip_blkno != oi2->ip_blkno) { - if (oi1->ip_blkno < oi2->ip_blkno) { + inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, + oi1->ip_blkno); + if (inode1_is_ancestor < 0) { + status = inode1_is_ancestor; + goto bail; + } + + inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, + oi2->ip_blkno); + if (inode2_is_ancestor < 0) { + status = inode2_is_ancestor; + goto bail; + } + + if ((inode1_is_ancestor == 1) || + (oi1->ip_blkno < oi2->ip_blkno && + inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ tmpbh = bh2; bh2 = bh1; @@ -1098,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, }; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; struct ocfs2_dir_lookup_result target_insert = { NULL, }; + bool should_add_orphan = false; /* At some point it might be nice to break this function up a * bit. */ @@ -1134,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } rename_lock = 1; + + /* here we cannot guarantee the inodes haven't just been + * changed, so check if they are nested again */ + status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, + old_inode->i_ino); + if (status < 0) { + mlog_errno(status); + goto bail; + } else if (status == 1) { + status = -EPERM; + trace_ocfs2_rename_not_permitted( + (unsigned long long)old_inode->i_ino, + (unsigned long long)new_dir->i_ino); + goto bail; + } } /* if old and new are the same, this'll just do one lock. */ @@ -1304,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir, mlog_errno(status); goto bail; } + should_add_orphan = true; } } else { BUG_ON(new_dentry->d_parent->d_inode != new_dir); @@ -1348,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode) || - (ocfs2_read_links_count(newfe) == 1)) { - status = ocfs2_orphan_add(osb, handle, new_inode, - newfe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - /* change the dirent to point to the correct inode */ status = ocfs2_update_entry(new_dir, handle, &target_lookup_res, old_inode); @@ -1373,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir, else ocfs2_add_links_count(newfe, -1); ocfs2_journal_dirty(handle, newfe_bh); + if (should_add_orphan) { + status = ocfs2_orphan_add(osb, handle, new_inode, + newfe_bh, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } } else { /* if the name was not found in new_dir, add it now */ status = ocfs2_add_entry(handle, new_dentry, old_inode, @@ -1642,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_symlink_begin(dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1830,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + dl = dentry->d_fsdata; + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, &lookup); @@ -1864,6 +1980,9 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + if (dl) + ocfs2_cleanup_add_entry_failure(osb, dentry, inode); + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8d64a97a9d5e..bbec539230fd 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -422,6 +422,7 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; + atomic_t osb_tl_disable; /* * How many clusters in our truncate log. * It must be protected by osb_tl_inode->i_mutex. diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62aa9d6..6cb019b7c6a8 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename, __entry->new_len, __get_str(new_name)) ); +DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); + TRACE_EVENT(ocfs2_rename_target_exists, TP_PROTO(int new_len, const char *new_name), TP_ARGS(new_len, new_name), diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 6ba4bcbc4796..636aab69ead5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1408,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size) { struct ocfs2_refcount_rec *l = a, *r = b, tmp; - tmp = *(struct ocfs2_refcount_rec *)l; - *(struct ocfs2_refcount_rec *)l = - *(struct ocfs2_refcount_rec *)r; - *(struct ocfs2_refcount_rec *)r = tmp; + tmp = *l; + *l = *r; + *r = tmp; } /* @@ -4289,9 +4288,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, goto out; } + error = ocfs2_rw_lock(inode, 1); + if (error) { + mlog_errno(error); + goto out; + } + error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); + ocfs2_rw_unlock(inode, 1); goto out; } @@ -4303,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 822ebc10f281..d5da6f624142 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -53,8 +53,6 @@ */ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, - int new_clusters, - u32 first_new_cluster, u16 cl_cpg, int set) { @@ -127,8 +125,6 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, cl_cpg, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } @@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, spin_lock(&OCFS2_I(bm_inode)->ip_lock); OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(bm_inode)->ip_lock); i_size_write(bm_inode, le64_to_cpu(fe->i_size)); @@ -167,8 +163,6 @@ out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, cl_cpg, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); @@ -569,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 83f1a665ae97..5d965e83bd43 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -709,7 +709,7 @@ static struct ctl_table ocfs2_root_table[] = { { } }; -static struct ctl_table_header *ocfs2_table_header = NULL; +static struct ctl_table_header *ocfs2_table_header; /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a7cdd56f4c79..ddb662b32447 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -75,7 +75,7 @@ #include "buffer_head_io.h" -static struct kmem_cache *ocfs2_inode_cachep = NULL; +static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; struct kmem_cache *ocfs2_qf_chunk_cachep; @@ -85,7 +85,7 @@ struct kmem_cache *ocfs2_qf_chunk_cachep; * workqueue and schedule on our own. */ struct workqueue_struct *ocfs2_wq = NULL; -static struct dentry *ocfs2_debugfs_root = NULL; +static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); @@ -1925,15 +1925,11 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); + ocfs2_truncate_log_shutdown(osb); + /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - /* - * During dismount, when it recovers another node it will call - * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. - */ - ocfs2_truncate_log_shutdown(osb); - ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); @@ -2292,8 +2288,8 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - strncpy(osb->vol_label, di->id2.i_super.s_label, 63); - osb->vol_label[63] = '\0'; + strlcpy(osb->vol_label, di->id2.i_super.s_label, + OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); osb->first_cluster_group_blkno = diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 52eaf33d346f..82e17b076ce7 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item { sector_t c_block; }; -static struct kmem_cache *ocfs2_uptodate_cachep = NULL; +static struct kmem_cache *ocfs2_uptodate_cachep; u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 54d57d6ba68d..902e88527fce 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -337,10 +337,10 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block) const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/open.c b/fs/open.c index 3d30eb1fc95e..d6fd3acde134 100644 --- a/fs/open.c +++ b/fs/open.c @@ -254,17 +254,21 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EBADF; /* - * It's not possible to punch hole or perform collapse range - * on append only file + * We can only allow pure fallocate on append only files */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE) - && IS_APPEND(inode)) + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* + * We cannot allow any fallocate operation on an active swapfile + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ @@ -286,14 +290,6 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - /* - * There is no need to overlap collapse range with EOF, in which case - * it is effectively a truncate operation - */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (offset + len >= i_size_read(inode))) - return -EINVAL; - if (!file->f_op->fallocate) return -EOPNOTSUPP; @@ -728,6 +724,12 @@ static int do_dentry_open(struct file *f, } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); + if ((f->f_mode & FMODE_READ) && + likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter)) + f->f_mode |= FMODE_CAN_READ; + if ((f->f_mode & FMODE_WRITE) && + likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter)) + f->f_mode |= FMODE_CAN_WRITE; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); diff --git a/fs/pipe.c b/fs/pipe.c index 034bffac3f97..21981e58e2a6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -116,50 +116,6 @@ void pipe_wait(struct pipe_inode_info *pipe) pipe_lock(pipe); } -static int -pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - if (__copy_from_user_inatomic(to, iov->iov_base, copy)) - return -EFAULT; - } else { - if (copy_from_user(to, iov->iov_base, copy)) - return -EFAULT; - } - to += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -/* - * Pre-fault in the user memory, so we can use atomic copies. - */ -static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - fault_in_pages_readable(iov->iov_base, this_len); - len -= this_len; - iov++; - } -} - static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -271,24 +227,18 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { }; static ssize_t -pipe_read(struct kiocb *iocb, const struct iovec *_iov, - unsigned long nr_segs, loff_t pos) +pipe_read(struct kiocb *iocb, struct iov_iter *to) { + size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; int do_wakeup; ssize_t ret; - struct iovec *iov = (struct iovec *)_iov; - size_t total_len; - struct iov_iter iter; - total_len = iov_length(iov, nr_segs); /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; - iov_iter_init(&iter, iov, nr_segs, total_len, 0); - do_wakeup = 0; ret = 0; __pipe_lock(pipe); @@ -312,7 +262,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } - written = copy_page_to_iter(buf->page, buf->offset, chars, &iter); + written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) ret = -EFAULT; @@ -386,24 +336,19 @@ static inline int is_packetized(struct file *file) } static ssize_t -pipe_write(struct kiocb *iocb, const struct iovec *_iov, - unsigned long nr_segs, loff_t ppos) +pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - ssize_t ret; - int do_wakeup; - struct iovec *iov = (struct iovec *)_iov; - size_t total_len; + ssize_t ret = 0; + int do_wakeup = 0; + size_t total_len = iov_iter_count(from); ssize_t chars; - total_len = iov_length(iov, nr_segs); /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; - do_wakeup = 0; - ret = 0; __pipe_lock(pipe); if (!pipe->readers) { @@ -422,38 +367,19 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { - int error, atomic = 1; - void *addr; - - error = ops->confirm(pipe, buf); + int error = ops->confirm(pipe, buf); if (error) goto out; - iov_fault_in_pages_read(iov, chars); -redo1: - if (atomic) - addr = kmap_atomic(buf->page); - else - addr = kmap(buf->page); - error = pipe_iov_copy_from_user(offset + addr, iov, - chars, atomic); - if (atomic) - kunmap_atomic(addr); - else - kunmap(buf->page); - ret = error; - do_wakeup = 1; - if (error) { - if (atomic) { - atomic = 0; - goto redo1; - } + ret = copy_page_from_iter(buf->page, offset, chars, from); + if (unlikely(ret < chars)) { + error = -EFAULT; goto out; } + do_wakeup = 1; buf->len += chars; - total_len -= chars; ret = chars; - if (!total_len) + if (!iov_iter_count(from)) goto out; } } @@ -472,8 +398,7 @@ redo1: int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; - char *src; - int error, atomic = 1; + int copied; if (!page) { page = alloc_page(GFP_HIGHUSER); @@ -489,40 +414,19 @@ redo1: * FIXME! Is this really true? */ do_wakeup = 1; - chars = PAGE_SIZE; - if (chars > total_len) - chars = total_len; - - iov_fault_in_pages_read(iov, chars); -redo2: - if (atomic) - src = kmap_atomic(page); - else - src = kmap(page); - - error = pipe_iov_copy_from_user(src, iov, chars, - atomic); - if (atomic) - kunmap_atomic(src); - else - kunmap(page); - - if (unlikely(error)) { - if (atomic) { - atomic = 0; - goto redo2; - } + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) - ret = error; + ret = -EFAULT; break; } - ret += chars; + ret += copied; /* Insert it into the buffer array */ buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = chars; + buf->len = copied; buf->flags = 0; if (is_packetized(filp)) { buf->ops = &packet_pipe_buf_ops; @@ -531,8 +435,7 @@ redo2: pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; - total_len -= chars; - if (!total_len) + if (!iov_iter_count(from)) break; } if (bufs < pipe->buffers) @@ -1044,10 +947,10 @@ err: const struct file_operations pipefifo_fops = { .open = fifo_open, .llseek = no_llseek, - .read = do_sync_read, - .aio_read = pipe_read, - .write = do_sync_write, - .aio_write = pipe_write, + .read = new_sync_read, + .read_iter = pipe_read, + .write = new_sync_write, + .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9e363e41dacc..0855f772cd41 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) umode_t mode = 0; int not_equiv = 0; + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9d231e9e5f0e..bf2d03f8fd3e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 442177b1119a..cfa63ee92c96 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -300,6 +300,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } + if (vma->vm_ops && vma->vm_ops->name) { + name = vma->vm_ops->name(vma); + if (name) + goto done; + } + name = arch_vma_name(vma); if (!name) { pid_t tid; @@ -737,9 +743,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_file_clear_soft_dirty(ptent); } - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -807,8 +810,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type == CLEAR_REFS_SOFT_DIRTY) { soft_dirty_cleared = true; - pr_warn_once("The pagemap bits 55-60 has changed their meaning! " - "See the linux/Documentation/vm/pagemap.txt for details.\n"); + pr_warn_once("The pagemap bits 55-60 has changed their meaning!" + " See the linux/Documentation/vm/pagemap.txt for " + "details.\n"); } task = get_proc_task(file_inode(file)); @@ -839,11 +843,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, * * Writing 3 to /proc/pid/clear_refs only affects file * mapped pages. + * + * Writing 4 to /proc/pid/clear_refs affects all pages. */ if (type == CLEAR_REFS_ANON && vma->vm_file) continue; if (type == CLEAR_REFS_MAPPED && !vma->vm_file) continue; + if (type == CLEAR_REFS_SOFT_DIRTY) { + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + } walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } @@ -1351,7 +1361,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, struct numa_maps *md; struct page *page; - if (pte_none(*pte)) + if (!pte_present(*pte)) return 0; page = pte_page(*pte); @@ -1408,10 +1418,10 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_printf(m, "%08lx %s", vma->vm_start, buffer); if (file) { - seq_printf(m, " file="); + seq_puts(m, " file="); seq_path(m, &file->f_path, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { - seq_printf(m, " heap"); + seq_puts(m, " heap"); } else { pid_t tid = vm_is_stack(task, vma, is_pid); if (tid != 0) { @@ -1421,14 +1431,14 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) */ if (!is_pid || (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack)) - seq_printf(m, " stack"); + seq_puts(m, " stack"); else seq_printf(m, " stack:%d", tid); } } if (is_vm_hugetlb_page(vma)) - seq_printf(m, " huge"); + seq_puts(m, " huge"); walk_page_range(vma->vm_start, vma->vm_end, &walk); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6a8e785b29da..382aa890e228 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -42,7 +42,7 @@ static size_t elfnotes_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -static struct proc_dir_entry *proc_vmcore = NULL; +static struct proc_dir_entry *proc_vmcore; /* * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 46d269e38706..0a9b72cdfeca 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "pstore: " fmt + #include <linux/atomic.h> #include <linux/types.h> #include <linux/errno.h> @@ -224,14 +226,12 @@ static void allocate_buf_for_compression(void) zlib_inflate_workspacesize()); stream.workspace = kmalloc(size, GFP_KERNEL); if (!stream.workspace) { - pr_err("pstore: No memory for compression workspace; " - "skipping compression\n"); + pr_err("No memory for compression workspace; skipping compression\n"); kfree(big_oops_buf); big_oops_buf = NULL; } } else { - pr_err("No memory for uncompressed data; " - "skipping compression\n"); + pr_err("No memory for uncompressed data; skipping compression\n"); stream.workspace = NULL; } @@ -455,8 +455,7 @@ int pstore_register(struct pstore_info *psi) add_timer(&pstore_timer); } - pr_info("pstore: Registered %s as persistent store backend\n", - psi->name); + pr_info("Registered %s as persistent store backend\n", psi->name); return 0; } @@ -502,8 +501,8 @@ void pstore_get_records(int quiet) size = unzipped_len; compressed = false; } else { - pr_err("pstore: decompression failed;" - "returned %d\n", unzipped_len); + pr_err("decompression failed;returned %d\n", + unzipped_len); compressed = true; } } @@ -524,8 +523,8 @@ out: mutex_unlock(&psi->read_mutex); if (failed) - printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", - failed, psi->name); + pr_warn("failed to load %d record(s) from '%s'\n", + failed, psi->name); } static void pstore_dowork(struct work_struct *work) diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index ff7e3d4df5a1..34a1e5aa848c 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -12,6 +12,8 @@ * */ +#define pr_fmt(fmt) "persistent_ram: " fmt + #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> @@ -205,12 +207,10 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) size = buffer->data + prz->buffer_size - block; numerr = persistent_ram_decode_rs8(prz, block, size, par); if (numerr > 0) { - pr_devel("persistent_ram: error in block %p, %d\n", - block, numerr); + pr_devel("error in block %p, %d\n", block, numerr); prz->corrected_bytes += numerr; } else if (numerr < 0) { - pr_devel("persistent_ram: uncorrectable error in block %p\n", - block); + pr_devel("uncorrectable error in block %p\n", block); prz->bad_blocks++; } block += prz->ecc_info.block_size; @@ -257,7 +257,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly, 0, 1, prz->ecc_info.ecc_size); if (prz->rs_decoder == NULL) { - pr_info("persistent_ram: init_rs failed\n"); + pr_info("init_rs failed\n"); return -EINVAL; } @@ -267,10 +267,10 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, numerr = persistent_ram_decode_rs8(prz, buffer, sizeof(*buffer), prz->par_header); if (numerr > 0) { - pr_info("persistent_ram: error in header, %d\n", numerr); + pr_info("error in header, %d\n", numerr); prz->corrected_bytes += numerr; } else if (numerr < 0) { - pr_info("persistent_ram: uncorrectable error in header\n"); + pr_info("uncorrectable error in header\n"); prz->bad_blocks++; } @@ -317,7 +317,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz) prz->old_log = kmalloc(size, GFP_KERNEL); } if (!prz->old_log) { - pr_err("persistent_ram: failed to allocate buffer\n"); + pr_err("failed to allocate buffer\n"); return; } @@ -396,8 +396,8 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); if (!pages) { - pr_err("%s: Failed to allocate array for %u pages\n", __func__, - page_count); + pr_err("%s: Failed to allocate array for %u pages\n", + __func__, page_count); return NULL; } @@ -462,19 +462,17 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, if (prz->buffer->sig == sig) { if (buffer_size(prz) > prz->buffer_size || buffer_start(prz) > buffer_size(prz)) - pr_info("persistent_ram: found existing invalid buffer," - " size %zu, start %zu\n", - buffer_size(prz), buffer_start(prz)); + pr_info("found existing invalid buffer, size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); else { - pr_debug("persistent_ram: found existing buffer," - " size %zu, start %zu\n", - buffer_size(prz), buffer_start(prz)); + pr_debug("found existing buffer, size %zu, start %zu\n", + buffer_size(prz), buffer_start(prz)); persistent_ram_save_old(prz); return 0; } } else { - pr_debug("persistent_ram: no valid data in buffer" - " (sig = 0x%08x)\n", prz->buffer->sig); + pr_debug("no valid data in buffer (sig = 0x%08x)\n", + prz->buffer->sig); } prz->buffer->sig = sig; @@ -509,7 +507,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL); if (!prz) { - pr_err("persistent_ram: failed to allocate persistent ram zone\n"); + pr_err("failed to allocate persistent ram zone\n"); goto err; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9cd5f63715c0..7f30bdc57d13 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -702,6 +702,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) struct dquot *dquot; unsigned long freed = 0; + spin_lock(&dq_list_lock); head = free_dquots.prev; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); @@ -713,6 +714,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) freed++; head = free_dquots.prev; } + spin_unlock(&dq_list_lock); return freed; } diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2b363e23f36e..ff3f0b3cfdb3 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -278,6 +278,17 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, return ret; } +static int quota_rmxquota(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->rm_xquota) + return -ENOSYS; + return sb->s_qcop->rm_xquota(sb, flags); +} + /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr, struct path *path) @@ -316,8 +327,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: case Q_XQUOTAOFF: - case Q_XQUOTARM: return quota_setxstate(sb, cmd, addr); + case Q_XQUOTARM: + return quota_rmxquota(sb, addr); case Q_XGETQSTAT: return quota_getxstate(sb, addr); case Q_XGETQSTATV: diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 1e56a4e8cf7c..4f56de822d2f 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -31,14 +31,14 @@ #include "internal.h" const struct file_operations ramfs_file_operations = { - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = noop_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0b3d8e4cb2fa..dda012ad4208 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -37,13 +37,13 @@ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); const struct file_operations ramfs_file_operations = { .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/read_write.c b/fs/read_write.c index 31c6efa43183..009d8542a889 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -25,11 +25,12 @@ typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, unsigned long, loff_t); +typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = generic_file_splice_read, }; @@ -390,13 +391,34 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp EXPORT_SYMBOL(do_sync_read); +ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + iov_iter_init(&iter, READ, &iov, 1, len); + + ret = filp->f_op->read_iter(&kiocb, &iter); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(new_sync_read); + ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; - if (!file->f_op->read && !file->f_op->aio_read) + if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) return -EFAULT; @@ -406,8 +428,10 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) count = ret; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); - else + else if (file->f_op->aio_read) ret = do_sync_read(file, buf, count, pos); + else + ret = new_sync_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); @@ -439,13 +463,34 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + iov_iter_init(&iter, WRITE, &iov, 1, len); + + ret = filp->f_op->write_iter(&kiocb, &iter); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + +EXPORT_SYMBOL(new_sync_write); + ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; const char __user *p; ssize_t ret; - if (!file->f_op->write && !file->f_op->aio_write) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; old_fs = get_fs(); @@ -455,8 +500,10 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t count = MAX_RW_COUNT; if (file->f_op->write) ret = file->f_op->write(file, p, count, pos); - else + else if (file->f_op->aio_write) ret = do_sync_write(file, p, count, pos); + else + ret = new_sync_write(file, p, count, pos); set_fs(old_fs); if (ret > 0) { fsnotify_modify(file); @@ -472,7 +519,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - if (!file->f_op->write && !file->f_op->aio_write) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; @@ -483,8 +530,10 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); - else + else if (file->f_op->aio_write) ret = do_sync_write(file, buf, count, pos); + else + ret = new_sync_write(file, buf, count, pos); if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); @@ -601,6 +650,25 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) } EXPORT_SYMBOL(iov_shorten); +static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov, + unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn) +{ + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = len; + + iov_iter_init(&iter, rw, iov, nr_segs, len); + ret = fn(&kiocb, &iter); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) { @@ -738,6 +806,7 @@ static ssize_t do_readv_writev(int type, struct file *file, ssize_t ret; io_fn_t fn; iov_fn_t fnv; + iter_fn_t iter_fn; ret = rw_copy_check_uvector(type, uvector, nr_segs, ARRAY_SIZE(iovstack), iovstack, &iov); @@ -753,13 +822,18 @@ static ssize_t do_readv_writev(int type, struct file *file, if (type == READ) { fn = file->f_op->read; fnv = file->f_op->aio_read; + iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + iter_fn = file->f_op->write_iter; file_start_write(file); } - if (fnv) + if (iter_fn) + ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, + pos, iter_fn); + else if (fnv) ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, pos, fnv); else @@ -785,7 +859,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, { if (!(file->f_mode & FMODE_READ)) return -EBADF; - if (!file->f_op->aio_read && !file->f_op->read) + if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; return do_readv_writev(READ, file, vec, vlen, pos); @@ -798,7 +872,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, { if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - if (!file->f_op->aio_write && !file->f_op->write) + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; return do_readv_writev(WRITE, file, vec, vlen, pos); @@ -912,6 +986,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, ssize_t ret; io_fn_t fn; iov_fn_t fnv; + iter_fn_t iter_fn; ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); @@ -927,13 +1002,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (type == READ) { fn = file->f_op->read; fnv = file->f_op->aio_read; + iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; fnv = file->f_op->aio_write; + iter_fn = file->f_op->write_iter; file_start_write(file); } - if (fnv) + if (iter_fn) + ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, + pos, iter_fn); + else if (fnv) ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, pos, fnv); else @@ -964,7 +1044,7 @@ static size_t compat_readv(struct file *file, goto out; ret = -EINVAL; - if (!file->f_op->aio_read && !file->f_op->read) + if (!(file->f_mode & FMODE_CAN_READ)) goto out; ret = compat_do_readv_writev(READ, file, vec, vlen, pos); @@ -1041,7 +1121,7 @@ static size_t compat_writev(struct file *file, goto out; ret = -EINVAL; - if (!file->f_op->aio_write && !file->f_op->write) + if (!(file->f_mode & FMODE_CAN_WRITE)) goto out; ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); diff --git a/fs/readdir.c b/fs/readdir.c index 5b53d995cae6..33fd92208cb7 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -13,6 +13,7 @@ #include <linux/stat.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/fsnotify.h> #include <linux/dirent.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -40,6 +41,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) ctx->pos = file->f_pos; res = file->f_op->iterate(file, ctx); file->f_pos = ctx->pos; + fsnotify_access(file); file_accessed(file); } mutex_unlock(&inode->i_mutex); diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index dc9a6829f7c6..dc198bc64c61 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -50,8 +50,10 @@ static inline void get_bit_address(struct super_block *s, unsigned int *bmap_nr, unsigned int *offset) { - /* It is in the bitmap block number equal to the block - * number divided by the number of bits in a block. */ + /* + * It is in the bitmap block number equal to the block + * number divided by the number of bits in a block. + */ *bmap_nr = block >> (s->s_blocksize_bits + 3); /* Within that bitmap block it is located at bit offset *offset. */ *offset = block & ((s->s_blocksize << 3) - 1); @@ -71,10 +73,12 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) get_bit_address(s, block, &bmap, &offset); - /* Old format filesystem? Unlikely, but the bitmaps are all up front so - * we need to account for it. */ + /* + * Old format filesystem? Unlikely, but the bitmaps are all + * up front so we need to account for it. + */ if (unlikely(test_bit(REISERFS_OLD_FORMAT, - &(REISERFS_SB(s)->s_properties)))) { + &REISERFS_SB(s)->s_properties))) { b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; if (block >= bmap1 && block <= bmap1 + bmap_count) { @@ -108,8 +112,11 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) return 1; } -/* searches in journal structures for a given block number (bmap, off). If block - is found in reiserfs journal it suggests next free block candidate to test. */ +/* + * Searches in journal structures for a given block number (bmap, off). + * If block is found in reiserfs journal it suggests next free block + * candidate to test. + */ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, int off, int *next) { @@ -120,7 +127,7 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, *next = tmp; PROC_INFO_INC(s, scan_bitmap.in_journal_hint); } else { - (*next) = off + 1; /* inc offset to avoid looping. */ + (*next) = off + 1; /* inc offset to avoid looping. */ PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); } PROC_INFO_INC(s, scan_bitmap.retry); @@ -129,8 +136,10 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, return 0; } -/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap - * block; */ +/* + * Searches for a window of zero bits with given minimum and maximum + * lengths in one bitmap block + */ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, unsigned int bmap_n, int *beg, int boundary, int min, int max, int unfm) @@ -142,14 +151,9 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, int org = *beg; BUG_ON(!th->t_trans_id); - RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); PROC_INFO_INC(s, scan_bitmap.bmap); -/* this is unclear and lacks comments, explain how journal bitmaps - work here for the reader. Convey a sense of the design here. What - is a window? */ -/* - I mean `a window of zero bits' as in description of this function - Zam. */ if (!bi) { reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " @@ -162,18 +166,21 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, return 0; while (1) { - cont: +cont: if (bi->free_count < min) { brelse(bh); - return 0; // No free blocks in this bitmap + return 0; /* No free blocks in this bitmap */ } /* search for a first zero bit -- beginning of a window */ *beg = reiserfs_find_next_zero_le_bit ((unsigned long *)(bh->b_data), boundary, *beg); - if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block - * cannot contain a zero window of minimum size */ + /* + * search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size + */ + if (*beg + min > boundary) { brelse(bh); return 0; } @@ -187,49 +194,75 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, next = end; break; } - /* finding the other end of zero bit window requires looking into journal structures (in - * case of searching for free blocks for unformatted nodes) */ + + /* + * finding the other end of zero bit window requires + * looking into journal structures (in case of + * searching for free blocks for unformatted nodes) + */ if (unfm && is_block_in_journal(s, bmap_n, end, &next)) break; } - /* now (*beg) points to beginning of zero bits window, - * (end) points to one bit after the window end */ - if (end - *beg >= min) { /* it seems we have found window of proper size */ + /* + * now (*beg) points to beginning of zero bits window, + * (end) points to one bit after the window end + */ + + /* found window of proper size */ + if (end - *beg >= min) { int i; reiserfs_prepare_for_journal(s, bh, 1); - /* try to set all blocks used checking are they still free */ + /* + * try to set all blocks used checking are + * they still free + */ for (i = *beg; i < end; i++) { - /* It seems that we should not check in journal again. */ + /* Don't check in journal again. */ if (reiserfs_test_and_set_le_bit (i, bh->b_data)) { - /* bit was set by another process - * while we slept in prepare_for_journal() */ + /* + * bit was set by another process while + * we slept in prepare_for_journal() + */ PROC_INFO_INC(s, scan_bitmap.stolen); - if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, - * if length of this set is more or equal to `min' */ + + /* + * we can continue with smaller set + * of allocated blocks, if length of + * this set is more or equal to `min' + */ + if (i >= *beg + min) { end = i; break; } - /* otherwise we clear all bit were set ... */ + + /* + * otherwise we clear all bit + * were set ... + */ while (--i >= *beg) reiserfs_clear_le_bit (i, bh->b_data); reiserfs_restore_prepared_buffer(s, bh); *beg = org; - /* ... and search again in current block from beginning */ + + /* + * Search again in current block + * from beginning + */ goto cont; } } bi->free_count -= (end - *beg); - journal_mark_dirty(th, s, bh); + journal_mark_dirty(th, bh); brelse(bh); /* free block count calculation */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); return end - (*beg); } else { @@ -268,11 +301,13 @@ static inline int block_group_used(struct super_block *s, u32 id) int bm = bmap_hash_id(s, id); struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; - /* If we don't have cached information on this bitmap block, we're + /* + * If we don't have cached information on this bitmap block, we're * going to have to load it later anyway. Loading it here allows us * to make a better decision. This favors long-term performance gain * with a better on-disk layout vs. a short term gain of skipping the - * read and potentially having a bad placement. */ + * read and potentially having a bad placement. + */ if (info->free_count == UINT_MAX) { struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); brelse(bh); @@ -305,26 +340,26 @@ __le32 reiserfs_choose_packing(struct inode * dir) return packing; } -/* Tries to find contiguous zero bit window (given size) in given region of - * bitmap and place new blocks there. Returns number of allocated blocks. */ +/* + * Tries to find contiguous zero bit window (given size) in given region of + * bitmap and place new blocks there. Returns number of allocated blocks. + */ static int scan_bitmap(struct reiserfs_transaction_handle *th, b_blocknr_t * start, b_blocknr_t finish, int min, int max, int unfm, sector_t file_block) { int nr_allocated = 0; struct super_block *s = th->t_super; - /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr - * - Hans, it is not a block number - Zam. */ - unsigned int bm, off; unsigned int end_bm, end_off; unsigned int off_max = s->s_blocksize << 3; BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, scan_bitmap.call); + + /* No point in looking for more free blocks */ if (SB_FREE_BLOCKS(s) <= 0) - return 0; // No point in looking for more free blocks + return 0; get_bit_address(s, *start, &bm, &off); get_bit_address(s, finish, &end_bm, &end_off); @@ -333,7 +368,8 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th, if (end_bm > reiserfs_bmap_count(s)) end_bm = reiserfs_bmap_count(s); - /* When the bitmap is more than 10% free, anyone can allocate. + /* + * When the bitmap is more than 10% free, anyone can allocate. * When it's less than 10% free, only files that already use the * bitmap are allowed. Once we pass 80% full, this restriction * is lifted. @@ -371,7 +407,7 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th, nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); - ret: +ret: *start = bm * off_max + off; return nr_allocated; @@ -388,9 +424,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, unsigned int nr, offset; BUG_ON(!th->t_trans_id); - PROC_INFO_INC(s, free_block); - rs = SB_DISK_SUPER_BLOCK(s); sbh = SB_BUFFER_WITH_SB(s); apbi = SB_AP_BITMAP(s); @@ -415,14 +449,14 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, "block %lu: bit already cleared", block); } apbi[nr].free_count++; - journal_mark_dirty(th, s, bmbh); + journal_mark_dirty(th, bmbh); brelse(bmbh); reiserfs_prepare_for_journal(s, sbh, 1); /* update super block */ set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); - journal_mark_dirty(th, s, sbh); + journal_mark_dirty(th, sbh); if (for_unformatted) { int depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(inode, 1); @@ -435,8 +469,8 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th, int for_unformatted) { struct super_block *s = th->t_super; - BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_trans_id); RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); if (!is_reusable(s, block, 1)) return; @@ -471,6 +505,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, unsigned long save = ei->i_prealloc_block; int dirty = 0; struct inode *inode = &ei->vfs_inode; + BUG_ON(!th->t_trans_id); #ifdef CONFIG_REISERFS_CHECK if (ei->i_prealloc_count < 0) @@ -486,7 +521,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, if (dirty) reiserfs_update_sd(th, inode); ei->i_prealloc_block = save; - list_del_init(&(ei->i_prealloc_list)); + list_del_init(&ei->i_prealloc_list); } /* FIXME: It should be inline function */ @@ -494,6 +529,7 @@ void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, struct inode *inode) { struct reiserfs_inode_info *ei = REISERFS_I(inode); + BUG_ON(!th->t_trans_id); if (ei->i_prealloc_count) __discard_prealloc(th, ei); @@ -504,7 +540,6 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; BUG_ON(!th->t_trans_id); - while (!list_empty(plist)) { struct reiserfs_inode_info *ei; ei = list_entry(plist->next, struct reiserfs_inode_info, @@ -532,7 +567,8 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) { char *this_char, *value; - REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ + /* clear default settings */ + REISERFS_SB(s)->s_alloc_options.bits = 0; while ((this_char = strsep(&options, ":")) != NULL) { if ((value = strchr(this_char, '=')) != NULL) @@ -562,7 +598,7 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) if (!strcmp(this_char, "displacing_new_packing_localities")) { SET_OPTION(displacing_new_packing_localities); continue; - }; + } if (!strcmp(this_char, "old_hashed_relocation")) { SET_OPTION(old_hashed_relocation); @@ -729,11 +765,12 @@ void show_alloc_options(struct seq_file *seq, struct super_block *s) static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) { char *hash_in; + if (hint->formatted_node) { hash_in = (char *)&hint->key.k_dir_id; } else { if (!hint->inode) { - //hint->search_start = hint->beg; + /*hint->search_start = hint->beg;*/ hash_in = (char *)&hint->key.k_dir_id; } else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) @@ -757,6 +794,7 @@ static void dirid_groups(reiserfs_blocknr_hint_t * hint) __u32 dirid = 0; int bm = 0; struct super_block *sb = hint->th->t_super; + if (hint->inode) dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); else if (hint->formatted_node) @@ -786,7 +824,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint) dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); - /* keep the root dir and it's first set of subdirs close to + /* + * keep the root dir and it's first set of subdirs close to * the start of the disk */ if (dirid <= 2) @@ -800,7 +839,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint) } } -/* returns 1 if it finds an indirect item and gets valid hint info +/* + * returns 1 if it finds an indirect item and gets valid hint info * from it, otherwise 0 */ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) @@ -812,25 +852,29 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) __le32 *item; int ret = 0; - if (!hint->path) /* reiserfs code can call this function w/o pointer to path - * structure supplied; then we rely on supplied search_start */ + /* + * reiserfs code can call this function w/o pointer to path + * structure supplied; then we rely on supplied search_start + */ + if (!hint->path) return 0; path = hint->path; bh = get_last_bh(path); RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); - ih = get_ih(path); + ih = tp_item_head(path); pos_in_item = path->pos_in_item; - item = get_item(path); + item = tp_item_body(path); hint->search_start = bh->b_blocknr; + /* + * for indirect item: go to left and look for the first non-hole entry + * in the indirect item + */ if (!hint->formatted_node && is_indirect_le_ih(ih)) { - /* for indirect item: go to left and look for the first non-hole entry - in the indirect item */ if (pos_in_item == I_UNFM_NUM(ih)) pos_in_item--; -// pos_in_item = I_UNFM_NUM (ih) - 1; while (pos_in_item >= 0) { int t = get_block_num(item, pos_in_item); if (t) { @@ -846,10 +890,12 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) return ret; } -/* should be, if formatted node, then try to put on first part of the device - specified as number of percent with mount option device, else try to put - on last of device. This is not to say it is good code to do so, - but the effect should be measured. */ +/* + * should be, if formatted node, then try to put on first part of the device + * specified as number of percent with mount option device, else try to put + * on last of device. This is not to say it is good code to do so, + * but the effect should be measured. + */ static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t * hint) { @@ -975,21 +1021,27 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, set_border_in_hint(s, hint); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* whenever we create a new directory, we displace it. At first we will - hash for location, later we might look for a moderately empty place for - it */ + /* + * whenever we create a new directory, we displace it. At first + * we will hash for location, later we might look for a moderately + * empty place for it + */ if (displacing_new_packing_localities(s) && hint->th->displace_new_blocks) { displace_new_packing_locality(hint); - /* we do not continue determine_search_start, - * if new packing locality is being displaced */ + /* + * we do not continue determine_search_start, + * if new packing locality is being displaced + */ return; } #endif - /* all persons should feel encouraged to add more special cases here and - * test them */ + /* + * all persons should feel encouraged to add more special cases + * here and test them + */ if (displacing_large_files(s) && !hint->formatted_node && this_blocknr_allocation_would_make_it_a_large_file(hint)) { @@ -997,8 +1049,10 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, return; } - /* if none of our special cases is relevant, use the left neighbor in the - tree order of the new node we are allocating for */ + /* + * if none of our special cases is relevant, use the left + * neighbor in the tree order of the new node we are allocating for + */ if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { hash_formatted_node(hint); return; @@ -1006,10 +1060,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, unfm_hint = get_left_neighbor(hint); - /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, - new blocks are displaced based on directory ID. Also, if suggested search_start - is less than last preallocated block, we start searching from it, assuming that - HDD dataflow is faster in forward direction */ + /* + * Mimic old block allocator behaviour, that is if VFS allowed for + * preallocation, new blocks are displaced based on directory ID. + * Also, if suggested search_start is less than last preallocated + * block, we start searching from it, assuming that HDD dataflow + * is faster in forward direction + */ if (TEST_OPTION(old_way, s)) { if (!hint->formatted_node) { if (!reiserfs_hashed_relocation(s)) @@ -1038,11 +1095,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint, TEST_OPTION(old_hashed_relocation, s)) { old_hashed_relocation(hint); } + /* new_hashed_relocation works with both formatted/unformatted nodes */ if ((!unfm_hint || hint->formatted_node) && TEST_OPTION(new_hashed_relocation, s)) { new_hashed_relocation(hint); } + /* dirid grouping works only on unformatted nodes */ if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { dirid_groups(hint); @@ -1080,8 +1139,6 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) return CARRY_ON; } -/* XXX I know it could be merged with upper-level function; - but may be result function would be too complex. */ static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, b_blocknr_t start, @@ -1109,7 +1166,10 @@ static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, /* do we have something to fill prealloc. array also ? */ if (nr_allocated > 0) { - /* it means prealloc_size was greater that 0 and we do preallocation */ + /* + * it means prealloc_size was greater that 0 and + * we do preallocation + */ list_add(&REISERFS_I(hint->inode)->i_prealloc_list, &SB_JOURNAL(hint->th->t_super)-> j_prealloc_list); @@ -1177,7 +1237,8 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start start = 0; finish = hint->beg; break; - default: /* We've tried searching everywhere, not enough space */ + default: + /* We've tried searching everywhere, not enough space */ /* Free the blocks */ if (!hint->formatted_node) { #ifdef REISERQUOTA_DEBUG @@ -1262,8 +1323,11 @@ static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, return amount_needed; } -int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us /* Amount of blocks we have - already reserved */ ) +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, + b_blocknr_t *new_blocknrs, + int amount_needed, + /* Amount of blocks we have already reserved */ + int reserved_by_us) { int initial_amount_needed = amount_needed; int ret; @@ -1275,15 +1339,21 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new return NO_DISK_SPACE; /* should this be if !hint->inode && hint->preallocate? */ /* do you mean hint->formatted_node can be removed ? - Zam */ - /* hint->formatted_node cannot be removed because we try to access - inode information here, and there is often no inode assotiated with - metadata allocations - green */ + /* + * hint->formatted_node cannot be removed because we try to access + * inode information here, and there is often no inode associated with + * metadata allocations - green + */ if (!hint->formatted_node && hint->preallocate) { amount_needed = use_preallocated_list_if_available (hint, new_blocknrs, amount_needed); - if (amount_needed == 0) /* all blocknrs we need we got from - prealloc. list */ + + /* + * We have all the block numbers we need from the + * prealloc list + */ + if (amount_needed == 0) return CARRY_ON; new_blocknrs += (initial_amount_needed - amount_needed); } @@ -1297,10 +1367,12 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new ret = blocknrs_and_prealloc_arrays_from_search_start (hint, new_blocknrs, amount_needed); - /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we - * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second - * variant) */ - + /* + * We used prealloc. list to fill (partially) new_blocknrs array. + * If final allocation fails we need to return blocks back to + * prealloc. list or just free them. -- Zam (I chose second + * variant) + */ if (ret != CARRY_ON) { while (amount_needed++ < initial_amount_needed) { reiserfs_free_block(hint->th, hint->inode, @@ -1339,10 +1411,12 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; struct buffer_head *bh; - /* Way old format filesystems had the bitmaps packed up front. - * I doubt there are any of these left, but just in case... */ + /* + * Way old format filesystems had the bitmaps packed up front. + * I doubt there are any of these left, but just in case... + */ if (unlikely(test_bit(REISERFS_OLD_FORMAT, - &(REISERFS_SB(sb)->s_properties)))) + &REISERFS_SB(sb)->s_properties))) block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; else if (bitmap == 0) block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index af677353a3f5..d9f5a60dd59b 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -59,7 +59,10 @@ static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *d int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) { - struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + + /* key of current position in the directory (key of directory entry) */ + struct cpu_key pos_key; + INITIALIZE_PATH(path_to_entry); struct buffer_head *bh; int item_num, entry_num; @@ -77,21 +80,28 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) reiserfs_check_lock_depth(inode->i_sb, "readdir"); - /* form key for search the next directory entry using f_pos field of - file structure */ + /* + * form key for search the next directory entry using + * f_pos field of file structure + */ make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3); next_pos = cpu_key_k_offset(&pos_key); path_to_entry.reada = PATH_READA; while (1) { - research: - /* search the directory item, containing entry with specified key */ +research: + /* + * search the directory item, containing entry with + * specified key + */ search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de); if (search_res == IO_ERROR) { - // FIXME: we could just skip part of directory which could - // not be read + /* + * FIXME: we could just skip part of directory + * which could not be read + */ ret = -EIO; goto out; } @@ -102,41 +112,49 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) store_ih(&tmp_ih, ih); /* we must have found item, that is item of this directory, */ - RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), + RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key), "vs-9000: found item %h does not match to dir we readdir %K", ih, &pos_key); RFALSE(item_num > B_NR_ITEMS(bh) - 1, "vs-9005 item_num == %d, item amount == %d", item_num, B_NR_ITEMS(bh)); - /* and entry must be not more than number of entries in the item */ - RFALSE(I_ENTRY_COUNT(ih) < entry_num, + /* + * and entry must be not more than number of entries + * in the item + */ + RFALSE(ih_entry_count(ih) < entry_num, "vs-9010: entry number is too big %d (%d)", - entry_num, I_ENTRY_COUNT(ih)); + entry_num, ih_entry_count(ih)); + /* + * go through all entries in the directory item beginning + * from the entry, that has been found + */ if (search_res == POSITION_FOUND - || entry_num < I_ENTRY_COUNT(ih)) { - /* go through all entries in the directory item beginning from the entry, that has been found */ + || entry_num < ih_entry_count(ih)) { struct reiserfs_de_head *deh = B_I_DEH(bh, ih) + entry_num; - for (; entry_num < I_ENTRY_COUNT(ih); + for (; entry_num < ih_entry_count(ih); entry_num++, deh++) { int d_reclen; char *d_name; ino_t d_ino; loff_t cur_pos = deh_offset(deh); + /* it is hidden entry */ if (!de_visible(deh)) - /* it is hidden entry */ continue; d_reclen = entry_length(bh, ih, entry_num); d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); if (d_reclen <= 0 || d_name + d_reclen > bh->b_data + bh->b_size) { - /* There is corrupted data in entry, - * We'd better stop here */ + /* + * There is corrupted data in entry, + * We'd better stop here + */ pathrelse(&path_to_entry); ret = -EIO; goto out; @@ -145,10 +163,10 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) if (!d_name[d_reclen - 1]) d_reclen = strlen(d_name); + /* too big to send back to VFS */ if (d_reclen > REISERFS_MAX_NAME(inode->i_sb-> s_blocksize)) { - /* too big to send back to VFS */ continue; } @@ -173,10 +191,14 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) goto research; } } - // Note, that we copy name to user space via temporary - // buffer (local_buf) because filldir will block if - // user space buffer is swapped out. At that time - // entry can move to somewhere else + + /* + * Note, that we copy name to user space via + * temporary buffer (local_buf) because + * filldir will block if user space buffer is + * swapped out. At that time entry can move to + * somewhere else + */ memcpy(local_buf, d_name, d_reclen); /* @@ -209,22 +231,26 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) } /* for */ } + /* end of directory has been reached */ if (item_num != B_NR_ITEMS(bh) - 1) - // end of directory has been reached goto end; - /* item we went through is last item of node. Using right - delimiting key check is it directory end */ + /* + * item we went through is last item of node. Using right + * delimiting key check is it directory end + */ rkey = get_rkey(&path_to_entry, inode->i_sb); if (!comp_le_keys(rkey, &MIN_KEY)) { - /* set pos_key to key, that is the smallest and greater - that key of the last entry in the item */ + /* + * set pos_key to key, that is the smallest and greater + * that key of the last entry in the item + */ set_cpu_key_k_offset(&pos_key, next_pos); continue; } + /* end of directory has been reached */ if (COMP_SHORT_KEYS(rkey, &pos_key)) { - // end of directory has been reached goto end; } @@ -248,71 +274,73 @@ static int reiserfs_readdir(struct file *file, struct dir_context *ctx) return reiserfs_readdir_inode(file_inode(file), ctx); } -/* compose directory item containing "." and ".." entries (entries are - not aligned to 4 byte boundary) */ -/* the last four params are LE */ +/* + * compose directory item containing "." and ".." entries (entries are + * not aligned to 4 byte boundary) + */ void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, __le32 par_dirid, __le32 par_objid) { - struct reiserfs_de_head *deh; + struct reiserfs_de_head *dot, *dotdot; memset(body, 0, EMPTY_DIR_SIZE_V1); - deh = (struct reiserfs_de_head *)body; + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; /* direntry header of "." */ - put_deh_offset(&(deh[0]), DOT_OFFSET); + put_deh_offset(dot, DOT_OFFSET); /* these two are from make_le_item_head, and are are LE */ - deh[0].deh_dir_id = dirid; - deh[0].deh_objectid = objid; - deh[0].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen(".")); - mark_de_visible(&(deh[0])); + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen(".")); + mark_de_visible(dot); /* direntry header of ".." */ - put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + put_deh_offset(dotdot, DOT_DOT_OFFSET); /* key of ".." for the root directory */ /* these two are from the inode, and are are LE */ - deh[1].deh_dir_id = par_dirid; - deh[1].deh_objectid = par_objid; - deh[1].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen("..")); - mark_de_visible(&(deh[1])); + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - strlen("..")); + mark_de_visible(dotdot); /* copy ".." and "." */ - memcpy(body + deh_location(&(deh[0])), ".", 1); - memcpy(body + deh_location(&(deh[1])), "..", 2); + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); } /* compose directory item containing "." and ".." entries */ void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, __le32 par_dirid, __le32 par_objid) { - struct reiserfs_de_head *deh; + struct reiserfs_de_head *dot, *dotdot; memset(body, 0, EMPTY_DIR_SIZE); - deh = (struct reiserfs_de_head *)body; + dot = (struct reiserfs_de_head *)body; + dotdot = dot + 1; /* direntry header of "." */ - put_deh_offset(&(deh[0]), DOT_OFFSET); + put_deh_offset(dot, DOT_OFFSET); /* these two are from make_le_item_head, and are are LE */ - deh[0].deh_dir_id = dirid; - deh[0].deh_objectid = objid; - deh[0].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); - mark_de_visible(&(deh[0])); + dot->deh_dir_id = dirid; + dot->deh_objectid = objid; + dot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); + mark_de_visible(dot); /* direntry header of ".." */ - put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + put_deh_offset(dotdot, DOT_DOT_OFFSET); /* key of ".." for the root directory */ /* these two are from the inode, and are are LE */ - deh[1].deh_dir_id = par_dirid; - deh[1].deh_objectid = par_objid; - deh[1].deh_state = 0; /* Endian safe if 0 */ - put_deh_location(&(deh[1]), - deh_location(&(deh[0])) - ROUND_UP(strlen(".."))); - mark_de_visible(&(deh[1])); + dotdot->deh_dir_id = par_dirid; + dotdot->deh_objectid = par_objid; + dotdot->deh_state = 0; /* Endian safe if 0 */ + put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen(".."))); + mark_de_visible(dotdot); /* copy ".." and "." */ - memcpy(body + deh_location(&(deh[0])), ".", 1); - memcpy(body + deh_location(&(deh[1])), "..", 2); + memcpy(body + deh_location(dot), ".", 1); + memcpy(body + deh_location(dotdot), "..", 2); } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 9a3c68cf6026..54fdf196bfb2 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -2,18 +2,13 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -/* Now we have all buffers that must be used in balancing of the tree */ -/* Further calculations can not cause schedule(), and thus the buffer */ -/* tree will be stable until the balancing will be finished */ -/* balance the tree according to the analysis made before, */ -/* and using buffers obtained after all above. */ - -/** - ** balance_leaf_when_delete - ** balance_leaf - ** do_balance - ** - **/ +/* + * Now we have all buffers that must be used in balancing of the tree + * Further calculations can not cause schedule(), and thus the buffer + * tree will be stable until the balancing will be finished + * balance the tree according to the analysis made before, + * and using buffers obtained after all above. + */ #include <asm/uaccess.h> #include <linux/time.h> @@ -61,48 +56,190 @@ static inline void buffer_info_init_bh(struct tree_balance *tb, inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, struct buffer_head *bh, int flag) { - journal_mark_dirty(tb->transaction_handle, - tb->transaction_handle->t_super, bh); + journal_mark_dirty(tb->transaction_handle, bh); } #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty -/* summary: - if deleting something ( tb->insert_size[0] < 0 ) - return(balance_leaf_when_delete()); (flag d handled here) - else - if lnum is larger than 0 we put items into the left node - if rnum is larger than 0 we put items into the right node - if snum1 is larger than 0 we put items into the new node s1 - if snum2 is larger than 0 we put items into the new node s2 -Note that all *num* count new items being created. - -It would be easier to read balance_leaf() if each of these summary -lines was a separate procedure rather than being inlined. I think -that there are many passages here and in balance_leaf_when_delete() in -which two calls to one procedure can replace two passages, and it -might save cache space and improve software maintenance costs to do so. - -Vladimir made the perceptive comment that we should offload most of -the decision making in this function into fix_nodes/check_balance, and -then create some sort of structure in tb that says what actions should -be performed by do_balance. - --Hans */ - -/* Balance leaf node in case of delete or cut: insert_size[0] < 0 +/* + * summary: + * if deleting something ( tb->insert_size[0] < 0 ) + * return(balance_leaf_when_delete()); (flag d handled here) + * else + * if lnum is larger than 0 we put items into the left node + * if rnum is larger than 0 we put items into the right node + * if snum1 is larger than 0 we put items into the new node s1 + * if snum2 is larger than 0 we put items into the new node s2 + * Note that all *num* count new items being created. + */ + +static void balance_leaf_when_delete_del(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct buffer_info bi; +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih = item_head(tbS0, item_pos); +#endif + + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size[0], ih); + + buffer_info_init_tbS0(tb, &bi); + leaf_delete_items(&bi, 0, item_pos, 1, -1); + + if (!item_pos && tb->CFL[0]) { + if (B_NR_ITEMS(tbS0)) { + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + if (!PATH_H_POSITION(tb->tb_path, 1)) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + } + } + + RFALSE(!item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], + tb->L[0]); +} + +/* cut item in S[0] */ +static void balance_leaf_when_delete_cut(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + struct item_head *ih = item_head(tbS0, item_pos); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + + if (is_direntry_le_ih(ih)) { + /* + * UFS unlink semantics are such that you can only + * delete one directory entry at a time. + * + * when we cut a directory tb->insert_size[0] means + * number of entries to be cut (always 1) + */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], + "PAP-12030: can not change delimiting key. CFL[0]=%p", + tb->CFL[0]); + + if (!item_pos && !pos_in_item && tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); + } else { + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic " + "length of item"); + } +} + +static int balance_leaf_when_delete_left(struct tree_balance *tb) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* L[0] must be joined with S[0] */ + if (tb->lnum[0] == -1) { + /* R[0] must be also joined with S[0] */ + if (tb->rnum[0] == -1) { + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { + /* + * all contents of all the + * 3 buffers will be in L[0] + */ + if (PATH_H_POSITION(tb->tb_path, 1) == 0 && + 1 < B_NR_ITEMS(tb->FR[0])) + replace_key(tb, tb->CFL[0], + tb->lkey[0], tb->FR[0], 1); + + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1, + NULL); + leaf_move_items(LEAF_FROM_R_TO_L, tb, + B_NR_ITEMS(tb->R[0]), -1, + NULL); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->R[0]); + + return 0; + } + + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL); + leaf_move_items(LEAF_FROM_L_TO_R, tb, + B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->L[0]); + + return -1; + } + + RFALSE(tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + + /* + * a part of contents of S[0] will be in L[0] and + * the rest part of S[0] will be in R[0] + */ + + RFALSE((tb->lnum[0] + tb->rnum[0] < n) || + (tb->lnum[0] + tb->rnum[0] > n + 1), + "PAP-12050: rnum(%d) and lnum(%d) and item " + "number(%d) in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE((tb->lnum[0] + tb->rnum[0] == n) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) " + "parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && + (tb->lbytes < 1 || tb->rbytes != -1), + "PAP-12060: bad rbytes (%d)/lbytes (%d) " + "parameters when items are split", + tb->rbytes, tb->lbytes); + + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; +} + +/* + * Balance leaf node in case of delete or cut: insert_size[0] < 0 * * lnum, rnum can have values >= -1 * -1 means that the neighbor must be joined with S * 0 means that nothing should be done with the neighbor - * >0 means to shift entirely or partly the specified number of items to the neighbor + * >0 means to shift entirely or partly the specified number of items + * to the neighbor */ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int item_pos = PATH_LAST_POSITION(tb->tb_path); - int pos_in_item = tb->tb_path->pos_in_item; struct buffer_info bi; int n; struct item_head *ih; @@ -114,1022 +251,1202 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), "PAP-12010: tree can not be empty"); - ih = B_N_PITEM_HEAD(tbS0, item_pos); + ih = item_head(tbS0, item_pos); buffer_info_init_tbS0(tb, &bi); /* Delete or truncate the item */ - switch (flag) { - case M_DELETE: /* delete item in S[0] */ + BUG_ON(flag != M_DELETE && flag != M_CUT); + if (flag == M_DELETE) + balance_leaf_when_delete_del(tb); + else /* M_CUT */ + balance_leaf_when_delete_cut(tb); - RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], - "vs-12013: mode Delete, insert size %d, ih to be deleted %h", - -tb->insert_size[0], ih); - leaf_delete_items(&bi, 0, item_pos, 1, -1); + /* + * the rule is that no shifting occurs unless by shifting + * a node can be freed + */ + n = B_NR_ITEMS(tbS0); - if (!item_pos && tb->CFL[0]) { - if (B_NR_ITEMS(tbS0)) { - replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, - 0); - } else { - if (!PATH_H_POSITION(tb->tb_path, 1)) - replace_key(tb, tb->CFL[0], tb->lkey[0], - PATH_H_PPARENT(tb->tb_path, - 0), 0); - } - } - RFALSE(!item_pos && !tb->CFL[0], - "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], - tb->L[0]); + /* L[0] takes part in balancing */ + if (tb->lnum[0]) + return balance_leaf_when_delete_left(tb); - break; + if (tb->rnum[0] == -1) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } - case M_CUT:{ /* cut item in S[0] */ - if (is_direntry_le_ih(ih)) { + RFALSE(tb->rnum[0], + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); + return 0; +} - /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ - /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ - tb->insert_size[0] = -1; - leaf_cut_from_buffer(&bi, item_pos, pos_in_item, - -tb->insert_size[0]); +static void balance_leaf_insert_left(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + int ret; + struct buffer_info bi; + int n = B_NR_ITEMS(tb->L[0]); + + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { + /* part of new item falls into L[0] */ + int new_item_len, shift; + int version; + + ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); + + /* Calculate item length to insert to S[0] */ + new_item_len = ih_item_len(ih) - tb->lbytes; + + /* Calculate and check item length to insert to L[0] */ + put_ih_item_len(ih, ih_item_len(ih) - new_item_len); + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12080: there is nothing to insert into L[0]: " + "ih_item_len=%d", ih_item_len(ih)); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + min_t(int, tb->zeroes_num, ih_item_len(ih))); + + version = ih_version(ih); + + /* + * Calculate key component, item length and body to + * insert into S[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + + add_le_ih_k_offset(ih, tb->lbytes << shift); + + put_ih_item_len(ih, new_item_len); + if (tb->lbytes > tb->zeroes_num) { + body += (tb->lbytes - tb->zeroes_num); + tb->zeroes_num = 0; + } else + tb->zeroes_num -= tb->lbytes; + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: " + "ih_item_len=%d", ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes); + + /* Insert new item into L[0] */ + buffer_info_init_left(tb, &bi); + leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, + tb->zeroes_num); + tb->insert_size[0] = 0; + tb->zeroes_num = 0; + } +} - RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], - "PAP-12030: can not change delimiting key. CFL[0]=%p", - tb->CFL[0]); +static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; - if (!item_pos && !pos_in_item && tb->CFL[0]) { - replace_key(tb, tb->CFL[0], tb->lkey[0], - tbS0, 0); - } - } else { - leaf_cut_from_buffer(&bi, item_pos, pos_in_item, - -tb->insert_size[0]); + RFALSE(tb->zeroes_num, + "PAP-12090: invalid parameter in case of a directory"); + + /* directory item */ + if (tb->lbytes > tb->pos_in_item) { + /* new directory entry falls into L[0] */ + struct item_head *pasted; + int ret, l_pos_in_item = tb->pos_in_item; + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 entries from given directory item + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); + if (ret && !tb->item_pos) { + pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1); + l_pos_in_item += ih_entry_count(pasted) - + (tb->lbytes - 1); + } - RFALSE(!ih_item_len(ih), - "PAP-12035: cut must leave non-zero dynamic length of item"); - } - break; + /* Append given directory entry to directory item */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + l_pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + /* + * previous string prepared space for pasting new entry, + * following string pastes this entry + */ + + /* + * when we have merge directory item, pos_in_item + * has been changed too + */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries(&bi, n + tb->item_pos - ret, + l_pos_in_item, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* + * Shift lnum[0]-1 items in whole. Shift lbytes + * directory entries from directory item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } + + /* Calculate new position to append in item body */ + tb->pos_in_item -= tb->lbytes; +} + +static void balance_leaf_paste_left_shift(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_left_shift_dirent(tb, ih, body); + return; + } + + RFALSE(tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. " + "lbytes=%d", tb->lbytes); + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12100: incorrect position to paste: " + "item_len=%d, pos_in_item=%d", + ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item); + + /* appended item will be in L[0] in whole */ + if (tb->lbytes >= tb->pos_in_item) { + struct item_head *tbS0_pos_ih, *tbL0_ih; + struct item_head *tbS0_0_ih; + struct reiserfs_key *left_delim_key; + int ret, l_n, version, temp_l; + + tbS0_pos_ih = item_head(tbS0, tb->item_pos); + tbS0_0_ih = item_head(tbS0, 0); + + /* + * this bytes number must be appended + * to the last item of L[h] + */ + l_n = tb->lbytes - tb->pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= l_n; + + RFALSE(tb->insert_size[0] <= 0, + "PAP-12105: there is nothing to paste into " + "L[0]. insert_size=%d", tb->insert_size[0]); + + ret = leaf_shift_left(tb, tb->lnum[0], + ih_item_len(tbS0_pos_ih)); + + tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, + ih_item_len(tbL0_ih), l_n, body, + min_t(int, l_n, tb->zeroes_num)); + + /* + * 0-th item in S0 can be only of DIRECT type + * when l_n != 0 + */ + temp_l = l_n; + + RFALSE(ih_item_len(tbS0_0_ih), + "PAP-12106: item length must be 0"); + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], n + tb->item_pos - ret)), + "PAP-12107: items must be of the same file"); + + if (is_indirect_le_ih(tbL0_ih)) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_l = l_n << shift; } + /* update key of first item in S0 */ + version = ih_version(tbS0_0_ih); + add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l); + + /* update left delimiting key */ + left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]); + add_le_key_k_offset(version, left_delim_key, temp_l); + + /* + * Calculate new body, position in item and + * insert_size[0] + */ + if (l_n > tb->zeroes_num) { + body += (l_n - tb->zeroes_num); + tb->zeroes_num = 0; + } else + tb->zeroes_num -= l_n; + tb->pos_in_item = 0; + + RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key, + leaf_key(tb->L[0], + B_NR_ITEMS(tb->L[0]) - 1)) || + !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) || + !op_is_left_mergeable(left_delim_key, tbS0->b_size), + "PAP-12120: item must be merge-able with left " + "neighboring item"); + } else { + /* only part of the appended item will be in L[0] */ + + /* Calculate position in item for append in S[0] */ + tb->pos_in_item -= tb->lbytes; + + RFALSE(tb->pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", + tb->pos_in_item); + + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 byte from item number lnum[0] + */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + } +} - default: - print_cur_tb("12040"); - reiserfs_panic(tb->tb_sb, "PAP-12040", - "unexpected mode: %s(%d)", - (flag == - M_PASTE) ? "PASTE" : ((flag == - M_INSERT) ? "INSERT" : - "UNKNOWN"), flag); + +/* appended item will be in L[0] in whole */ +static void balance_leaf_paste_left_whole(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tb->L[0]); + struct buffer_info bi; + struct item_head *pasted; + int ret; + + /* if we paste into first item of S[0] and it is left mergable */ + if (!tb->item_pos && + op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) { + /* + * then increment pos_in_item by the size of the + * last item in L[0] + */ + pasted = item_head(tb->L[0], n - 1); + if (is_direntry_le_ih(pasted)) + tb->pos_in_item += ih_entry_count(pasted); + else + tb->pos_in_item += ih_item_len(pasted); } - /* the rule is that no shifting occurs unless by shifting a node can be freed */ - n = B_NR_ITEMS(tbS0); - if (tb->lnum[0]) { /* L[0] takes part in balancing */ - if (tb->lnum[0] == -1) { /* L[0] must be joined with S[0] */ - if (tb->rnum[0] == -1) { /* R[0] must be also joined with S[0] */ - if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { - /* all contents of all the 3 buffers will be in L[0] */ - if (PATH_H_POSITION(tb->tb_path, 1) == 0 - && 1 < B_NR_ITEMS(tb->FR[0])) - replace_key(tb, tb->CFL[0], - tb->lkey[0], - tb->FR[0], 1); - - leaf_move_items(LEAF_FROM_S_TO_L, tb, n, - -1, NULL); - leaf_move_items(LEAF_FROM_R_TO_L, tb, - B_NR_ITEMS(tb->R[0]), - -1, NULL); - - reiserfs_invalidate_buffer(tb, tbS0); - reiserfs_invalidate_buffer(tb, - tb->R[0]); - - return 0; - } - /* all contents of all the 3 buffers will be in R[0] */ - leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, - NULL); - leaf_move_items(LEAF_FROM_L_TO_R, tb, - B_NR_ITEMS(tb->L[0]), -1, NULL); + /* + * Shift lnum[0] - 1 items in whole. + * Shift lbytes - 1 byte from item number lnum[0] + */ + ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + + /* Append to body of item in L[0] */ + buffer_info_init_left(tb, &bi); + leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* if appended item is directory, paste entry */ + pasted = item_head(tb->L[0], n + tb->item_pos - ret); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, n + tb->item_pos - ret, + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* + * if appended item is indirect item, put unformatted node + * into un list + */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); - /* right_delimiting_key is correct in R[0] */ - replace_key(tb, tb->CFR[0], tb->rkey[0], - tb->R[0], 0); + tb->insert_size[0] = 0; + tb->zeroes_num = 0; +} - reiserfs_invalidate_buffer(tb, tbS0); - reiserfs_invalidate_buffer(tb, tb->L[0]); +static void balance_leaf_paste_left(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + /* we must shift the part of the appended item */ + if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) + balance_leaf_paste_left_shift(tb, ih, body); + else + balance_leaf_paste_left_whole(tb, ih, body); +} - return -1; - } +/* Shift lnum[0] items from S[0] to the left neighbor L[0] */ +static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + if (tb->lnum[0] <= 0) + return; - RFALSE(tb->rnum[0] != 0, - "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); - /* all contents of L[0] and S[0] will be in L[0] */ - leaf_shift_left(tb, n, -1); + /* new item or it part falls to L[0], shift it too */ + if (tb->item_pos < tb->lnum[0]) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + if (flag == M_INSERT) + balance_leaf_insert_left(tb, ih, body); + else /* M_PASTE */ + balance_leaf_paste_left(tb, ih, body); + } else + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); +} - reiserfs_invalidate_buffer(tb, tbS0); - return 0; +static void balance_leaf_insert_right(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct buffer_info bi; + int ret; + + /* new item or part of it doesn't fall into R[0] */ + if (n - tb->rnum[0] >= tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; + } + + /* new item or its part falls to R[0] */ + + /* part of new item falls into R[0] */ + if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { + loff_t old_key_comp, old_len, r_zeroes_number; + const char *r_body; + int version, shift; + loff_t offset; + + leaf_shift_right(tb, tb->rnum[0] - 1, -1); + + version = ih_version(ih); + + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into R[0] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift); + set_le_ih_k_offset(ih, offset); + put_ih_item_len(ih, tb->rbytes); + + /* Insert part of the item into R[0] */ + buffer_info_init_right(tb, &bi); + if ((old_len - tb->rbytes) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->rbytes) - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - + (old_len - tb->rbytes); + tb->zeroes_num -= r_zeroes_number; } - /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ - - RFALSE((tb->lnum[0] + tb->rnum[0] < n) || - (tb->lnum[0] + tb->rnum[0] > n + 1), - "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", - tb->rnum[0], tb->lnum[0], n); - RFALSE((tb->lnum[0] + tb->rnum[0] == n) && - (tb->lbytes != -1 || tb->rbytes != -1), - "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", - tb->rbytes, tb->lbytes); - RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && - (tb->lbytes < 1 || tb->rbytes != -1), - "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", - tb->rbytes, tb->lbytes); - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + /* + * Calculate key component and item length to + * insert into S[0] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } else { + /* whole new item falls into R[0] */ + + /* Shift rnum[0]-1 items to R[0] */ + ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + + /* Insert new item into R[0] */ + buffer_info_init_right(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1, + ih, body, tb->zeroes_num); + + if (tb->item_pos - n + tb->rnum[0] - 1 == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + } +} + + +static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + int entry_count; + + RFALSE(tb->zeroes_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = ih_entry_count(item_head(tbS0, tb->item_pos)); + + /* new directory entry falls into R[0] */ + if (entry_count - tb->rbytes < tb->pos_in_item) { + int paste_entry_position; + + RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: " + "rbytes=%d, entry_count=%d", tb->rbytes, entry_count); + + /* + * Shift rnum[0]-1 items in whole. + * Shift rbytes-1 directory entries from directory + * item number rnum[0] + */ + leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1); + + /* Paste given directory entry to directory item */ + paste_entry_position = tb->pos_in_item - entry_count + + tb->rbytes - 1; + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, 0, paste_entry_position, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, 0, paste_entry_position, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + /* change delimiting keys */ + if (paste_entry_position == 0) + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into R[0] */ leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + } +} - reiserfs_invalidate_buffer(tb, tbS0); +static void balance_leaf_paste_right_shift(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n_shift, n_rem, r_zeroes_number, version; + unsigned long temp_rem; + const char *r_body; + struct buffer_info bi; - return 0; + /* we append to directory item */ + if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { + balance_leaf_paste_right_shift_dirent(tb, ih, body); + return; } - if (tb->rnum[0] == -1) { - /* all contents of R[0] and S[0] will be in R[0] */ - leaf_shift_right(tb, n, -1); - reiserfs_invalidate_buffer(tb, tbS0); - return 0; + /* regular object */ + + /* + * Calculate number of bytes which must be shifted + * from appended item + */ + n_shift = tb->rbytes - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)), + "PAP-12155: invalid position to paste. ih_item_len=%d, " + "pos_in_item=%d", tb->pos_in_item, + ih_item_len(item_head(tbS0, tb->item_pos))); + + leaf_shift_right(tb, tb->rnum[0], n_shift); + + /* + * Calculate number of bytes which must remain in body + * after appending to R[0] + */ + n_rem = tb->insert_size[0] - tb->rbytes; + if (n_rem < 0) + n_rem = 0; + + temp_rem = n_rem; + + version = ih_version(item_head(tb->R[0], 0)); + + if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) { + int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + temp_rem = n_rem << shift; } - RFALSE(tb->rnum[0], - "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); - return 0; + add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem); + add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]), + temp_rem); + + do_balance_mark_internal_dirty(tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + buffer_info_init_right(tb, &bi); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + + if (is_indirect_le_ih(item_head(tb->R[0], 0))) + set_ih_free_space(item_head(tb->R[0], 0), 0); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; } -static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item header of inserted item (this is on little endian) */ - const char *body, /* body of inserted item or bytes to paste */ - int flag, /* i - insert, d - delete, c - cut, p - paste - (see comment to do_balance) */ - struct item_head *insert_key, /* in our processing of one level we sometimes determine what - must be inserted into the next higher level. This insertion - consists of a key or two keys and their corresponding - pointers */ - struct buffer_head **insert_ptr /* inserted node-ptrs for the next level */ - ) +static void balance_leaf_paste_right_whole(struct tree_balance *tb, + struct item_head *ih, const char *body) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] - of the affected item */ + int n = B_NR_ITEMS(tbS0); + struct item_head *pasted; struct buffer_info bi; - struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ - int snum[2]; /* number of items that will be placed - into S_new (includes partially shifted - items) */ - int sbytes[2]; /* if an item is partially shifted into S_new then - if it is a directory item - it is the number of entries from the item that are shifted into S_new - else - it is the number of bytes from the item that are shifted into S_new - */ - int n, i; - int ret_val; - int pos_in_item; - int zeros_num; - PROC_INFO_INC(tb->tb_sb, balance_at[0]); + buffer_info_init_right(tb, &bi); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + /* append item in R[0] */ + if (tb->pos_in_item >= 0) { + buffer_info_init_right(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); + } - /* Make balance in case insert_size[0] < 0 */ - if (tb->insert_size[0] < 0) - return balance_leaf_when_delete(tb, flag); + /* paste new entry, if item is directory item */ + pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]); + if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) { + leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0], + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); - zeros_num = 0; - if (flag == M_INSERT && !body) - zeros_num = ih_item_len(ih); + if (!tb->pos_in_item) { - pos_in_item = tb->tb_path->pos_in_item; - /* for indirect item pos_in_item is measured in unformatted node - pointers. Recalculate to bytes */ - if (flag != M_INSERT - && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) - pos_in_item *= UNFM_P_SIZE; - - if (tb->lnum[0] > 0) { - /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ - if (item_pos < tb->lnum[0]) { - /* new item or it part falls to L[0], shift it too */ - n = B_NR_ITEMS(tb->L[0]); - - switch (flag) { - case M_INSERT: /* insert item into L[0] */ - - if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { - /* part of new item falls into L[0] */ - int new_item_len; - int version; - - ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, -1); - - /* Calculate item length to insert to S[0] */ - new_item_len = ih_item_len(ih) - tb->lbytes; - /* Calculate and check item length to insert to L[0] */ - put_ih_item_len(ih, ih_item_len(ih) - new_item_len); - - RFALSE(ih_item_len(ih) <= 0, - "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", - ih_item_len(ih)); - - /* Insert new item into L[0] */ - buffer_info_init_left(tb, &bi); - leaf_insert_into_buf(&bi, - n + item_pos - ret_val, ih, body, - zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num); - - version = ih_version(ih); - - /* Calculate key component, item length and body to insert into S[0] */ - set_le_ih_k_offset(ih, le_ih_k_offset(ih) + - (tb-> lbytes << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0))); - - put_ih_item_len(ih, new_item_len); - if (tb->lbytes > zeros_num) { - body += (tb->lbytes - zeros_num); - zeros_num = 0; - } else - zeros_num -= tb->lbytes; - - RFALSE(ih_item_len(ih) <= 0, - "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", - ih_item_len(ih)); - } else { - /* new item in whole falls into L[0] */ - /* Shift lnum[0]-1 items to L[0] */ - ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes); - /* Insert new item into L[0] */ - buffer_info_init_left(tb, &bi); - leaf_insert_into_buf(&bi, n + item_pos - ret_val, ih, body, zeros_num); - tb->insert_size[0] = 0; - zeros_num = 0; - } - break; - - case M_PASTE: /* append item in L[0] */ - - if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { - /* we must shift the part of the appended item */ - if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { - - RFALSE(zeros_num, - "PAP-12090: invalid parameter in case of a directory"); - /* directory item */ - if (tb->lbytes > pos_in_item) { - /* new directory entry falls into L[0] */ - struct item_head *pasted; - int l_pos_in_item = pos_in_item; - - /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ - ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes-1); - if (ret_val && !item_pos) { - pasted = B_N_PITEM_HEAD(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1); - l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes -1); - } - - /* Append given directory entry to directory item */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer(&bi, n + item_pos - ret_val, l_pos_in_item, tb->insert_size[0], body, zeros_num); - - /* previous string prepared space for pasting new entry, following string pastes this entry */ - - /* when we have merge directory item, pos_in_item has been changed too */ - - /* paste new directory entry. 1 is entry number */ - leaf_paste_entries(&bi, n + item_pos - ret_val, l_pos_in_item, - 1, (struct reiserfs_de_head *) body, - body + DEH_SIZE, tb->insert_size[0]); - tb->insert_size[0] = 0; - } else { - /* new directory item doesn't fall into L[0] */ - /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - } - /* Calculate new position to append in item body */ - pos_in_item -= tb->lbytes; - } else { - /* regular object */ - RFALSE(tb->lbytes <= 0, "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", tb->lbytes); - RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)), - "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", - ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),pos_in_item); - - if (tb->lbytes >= pos_in_item) { - /* appended item will be in L[0] in whole */ - int l_n; - - /* this bytes number must be appended to the last item of L[h] */ - l_n = tb->lbytes - pos_in_item; - - /* Calculate new insert_size[0] */ - tb->insert_size[0] -= l_n; - - RFALSE(tb->insert_size[0] <= 0, - "PAP-12105: there is nothing to paste into L[0]. insert_size=%d", - tb->insert_size[0]); - ret_val = leaf_shift_left(tb, tb->lnum[0], ih_item_len - (B_N_PITEM_HEAD(tbS0, item_pos))); - /* Append to body of item in L[0] */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer - (&bi, n + item_pos - ret_val, ih_item_len - (B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val)), - l_n, body, - zeros_num > l_n ? l_n : zeros_num); - /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */ - { - int version; - int temp_l = l_n; - - RFALSE(ih_item_len(B_N_PITEM_HEAD(tbS0, 0)), - "PAP-12106: item length must be 0"); - RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY - (tb->L[0], n + item_pos - ret_val)), - "PAP-12107: items must be of the same file"); - if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) { - temp_l = l_n << (tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT); - } - /* update key of first item in S0 */ - version = ih_version(B_N_PITEM_HEAD(tbS0, 0)); - set_le_key_k_offset(version, B_N_PKEY(tbS0, 0), - le_key_k_offset(version,B_N_PKEY(tbS0, 0)) + temp_l); - /* update left delimiting key */ - set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), - le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0])) + temp_l); - } - - /* Calculate new body, position in item and insert_size[0] */ - if (l_n > zeros_num) { - body += (l_n - zeros_num); - zeros_num = 0; - } else - zeros_num -= l_n; - pos_in_item = 0; - - RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1)) - || !op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size) - || !op_is_left_mergeable(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), tbS0->b_size), - "PAP-12120: item must be merge-able with left neighboring item"); - } else { /* only part of the appended item will be in L[0] */ - - /* Calculate position in item for append in S[0] */ - pos_in_item -= tb->lbytes; - - RFALSE(pos_in_item <= 0, "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item); - - /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - } - } - } else { /* appended item will be in L[0] in whole */ - - struct item_head *pasted; - - if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */ - /* then increment pos_in_item by the size of the last item in L[0] */ - pasted = B_N_PITEM_HEAD(tb->L[0], n - 1); - if (is_direntry_le_ih(pasted)) - pos_in_item += ih_entry_count(pasted); - else - pos_in_item += ih_item_len(pasted); - } - - /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ - ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes); - /* Append to body of item in L[0] */ - buffer_info_init_left(tb, &bi); - leaf_paste_in_buffer(&bi, n + item_pos - ret_val, - pos_in_item, - tb->insert_size[0], - body, zeros_num); - - /* if appended item is directory, paste entry */ - pasted = B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val); - if (is_direntry_le_ih(pasted)) - leaf_paste_entries(&bi, n + item_pos - ret_val, - pos_in_item, 1, - (struct reiserfs_de_head *) body, - body + DEH_SIZE, - tb->insert_size[0]); - /* if appended item is indirect item, put unformatted node into un list */ - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - tb->insert_size[0] = 0; - zeros_num = 0; - } - break; - default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, "PAP-12130", - "lnum > 0: unexpected mode: " - " %s(%d)", - (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); - } - } else { - /* new item doesn't fall into L[0] */ - leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + RFALSE(tb->item_pos - n + tb->rnum[0], + "PAP-12165: directory item must be first " + "item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); } } - /* tb->lnum[0] > 0 */ - /* Calculate new item position */ - item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); - - if (tb->rnum[0] > 0) { - /* shift rnum[0] items from S[0] to the right neighbor R[0] */ - n = B_NR_ITEMS(tbS0); - switch (flag) { - - case M_INSERT: /* insert item */ - if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */ - if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */ - loff_t old_key_comp, old_len, r_zeros_number; - const char *r_body; - int version; - loff_t offset; - - leaf_shift_right(tb, tb->rnum[0] - 1, -1); - - version = ih_version(ih); - /* Remember key component and item length */ - old_key_comp = le_ih_k_offset(ih); - old_len = ih_item_len(ih); - - /* Calculate key component and item length to insert into R[0] */ - offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << (is_indirect_le_ih(ih) ? tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT : 0)); - set_le_ih_k_offset(ih, offset); - put_ih_item_len(ih, tb->rbytes); - /* Insert part of the item into R[0] */ - buffer_info_init_right(tb, &bi); - if ((old_len - tb->rbytes) > zeros_num) { - r_zeros_number = 0; - r_body = body + (old_len - tb->rbytes) - zeros_num; - } else { - r_body = body; - r_zeros_number = zeros_num - (old_len - tb->rbytes); - zeros_num -= r_zeros_number; - } - - leaf_insert_into_buf(&bi, 0, ih, r_body, - r_zeros_number); - - /* Replace right delimiting key by first key in R[0] */ - replace_key(tb, tb->CFR[0], tb->rkey[0], - tb->R[0], 0); - - /* Calculate key component and item length to insert into S[0] */ - set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, old_len - tb->rbytes); - - tb->insert_size[0] -= tb->rbytes; - - } else { /* whole new item falls into R[0] */ - - /* Shift rnum[0]-1 items to R[0] */ - ret_val = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); - /* Insert new item into R[0] */ - buffer_info_init_right(tb, &bi); - leaf_insert_into_buf(&bi, item_pos - n + tb->rnum[0] - 1, - ih, body, zeros_num); - - if (item_pos - n + tb->rnum[0] - 1 == 0) { - replace_key(tb, tb->CFR[0], - tb->rkey[0], - tb->R[0], 0); - - } - zeros_num = tb->insert_size[0] = 0; - } - } else { /* new item or part of it doesn't fall into R[0] */ - - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - } - break; + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + tb->zeroes_num = tb->insert_size[0] = 0; +} - case M_PASTE: /* append item */ - - if (n - tb->rnum[0] <= item_pos) { /* pasted item or part of it falls to R[0] */ - if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) { /* we must shift the part of the appended item */ - if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { /* we append to directory item */ - int entry_count; - - RFALSE(zeros_num, - "PAP-12145: invalid parameter in case of a directory"); - entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD - (tbS0, item_pos)); - if (entry_count - tb->rbytes < - pos_in_item) - /* new directory entry falls into R[0] */ - { - int paste_entry_position; - - RFALSE(tb->rbytes - 1 >= entry_count || !tb-> insert_size[0], - "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", - tb->rbytes, entry_count); - /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */ - leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1); - /* Paste given directory entry to directory item */ - paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1; - buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer(&bi, 0, paste_entry_position, tb->insert_size[0], body, zeros_num); - /* paste entry */ - leaf_paste_entries(&bi, 0, paste_entry_position, 1, - (struct reiserfs_de_head *) body, - body + DEH_SIZE, tb->insert_size[0]); - - if (paste_entry_position == 0) { - /* change delimiting keys */ - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0],0); - } - - tb->insert_size[0] = 0; - pos_in_item++; - } else { /* new directory entry doesn't fall into R[0] */ - - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - } - } else { /* regular object */ - - int n_shift, n_rem, r_zeros_number; - const char *r_body; - - /* Calculate number of bytes which must be shifted from appended item */ - if ((n_shift = tb->rbytes - tb->insert_size[0]) < 0) - n_shift = 0; - - RFALSE(pos_in_item != ih_item_len - (B_N_PITEM_HEAD(tbS0, item_pos)), - "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d", - pos_in_item, ih_item_len - (B_N_PITEM_HEAD(tbS0, item_pos))); - - leaf_shift_right(tb, tb->rnum[0], n_shift); - /* Calculate number of bytes which must remain in body after appending to R[0] */ - if ((n_rem = tb->insert_size[0] - tb->rbytes) < 0) - n_rem = 0; - - { - int version; - unsigned long temp_rem = n_rem; - - version = ih_version(B_N_PITEM_HEAD(tb->R[0], 0)); - if (is_indirect_le_key(version, B_N_PKEY(tb->R[0], 0))) { - temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT); - } - set_le_key_k_offset(version, B_N_PKEY(tb->R[0], 0), - le_key_k_offset(version, B_N_PKEY(tb->R[0], 0)) + temp_rem); - set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]), - le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])) + temp_rem); - } -/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; - k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ - do_balance_mark_internal_dirty(tb, tb->CFR[0], 0); - - /* Append part of body into R[0] */ - buffer_info_init_right(tb, &bi); - if (n_rem > zeros_num) { - r_zeros_number = 0; - r_body = body + n_rem - zeros_num; - } else { - r_body = body; - r_zeros_number = zeros_num - n_rem; - zeros_num -= r_zeros_number; - } - - leaf_paste_in_buffer(&bi, 0, n_shift, - tb->insert_size[0] - n_rem, - r_body, r_zeros_number); - - if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->R[0], 0))) { -#if 0 - RFALSE(n_rem, - "PAP-12160: paste more than one unformatted node pointer"); -#endif - set_ih_free_space(B_N_PITEM_HEAD(tb->R[0], 0), 0); - } - tb->insert_size[0] = n_rem; - if (!n_rem) - pos_in_item++; - } - } else { /* pasted item in whole falls into R[0] */ - - struct item_head *pasted; - - ret_val = leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - /* append item in R[0] */ - if (pos_in_item >= 0) { - buffer_info_init_right(tb, &bi); - leaf_paste_in_buffer(&bi, item_pos - n + tb->rnum[0], pos_in_item, - tb->insert_size[0], body, zeros_num); - } - - /* paste new entry, if item is directory item */ - pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]); - if (is_direntry_le_ih(pasted) && pos_in_item >= 0) { - leaf_paste_entries(&bi, item_pos - n + tb->rnum[0], - pos_in_item, 1, - (struct reiserfs_de_head *) body, - body + DEH_SIZE, tb->insert_size[0]); - if (!pos_in_item) { - - RFALSE(item_pos - n + tb->rnum[0], - "PAP-12165: directory item must be first item of node when pasting is in 0th position"); - - /* update delimiting keys */ - replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - } - } - - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - zeros_num = tb->insert_size[0] = 0; - } - } else { /* new item doesn't fall into R[0] */ - - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - } - break; - default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, "PAP-12175", - "rnum > 0: unexpected mode: %s(%d)", - (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); - } +static void balance_leaf_paste_right(struct tree_balance *tb, + struct item_head *ih, const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + /* new item doesn't fall into R[0] */ + if (n - tb->rnum[0] > tb->item_pos) { + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + return; } - /* tb->rnum[0] > 0 */ - RFALSE(tb->blknum[0] > 3, - "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); - RFALSE(tb->blknum[0] < 0, - "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]); + /* pasted item or part of it falls to R[0] */ - /* if while adding to a node we discover that it is possible to split - it in two, and merge the left part into the left neighbor and the - right part into the right neighbor, eliminating the node */ - if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1) + /* we must shift the part of the appended item */ + balance_leaf_paste_right_shift(tb, ih, body); + else + /* pasted item in whole falls into R[0] */ + balance_leaf_paste_right_whole(tb, ih, body); +} - RFALSE(!tb->lnum[0] || !tb->rnum[0], - "PAP-12190: lnum and rnum must not be zero"); - /* if insertion was done before 0-th position in R[0], right - delimiting key of the tb->L[0]'s and left delimiting key are - not set correctly */ - if (tb->CFL[0]) { - if (!tb->CFR[0]) - reiserfs_panic(tb->tb_sb, "vs-12195", - "CFR not initialized"); - copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), - B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])); - do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); +/* shift rnum[0] items from S[0] to the right neighbor R[0] */ +static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + if (tb->rnum[0] <= 0) + return; + + BUG_ON(flag != M_INSERT && flag != M_PASTE); + + if (flag == M_INSERT) + balance_leaf_insert_right(tb, ih, body); + else /* M_PASTE */ + balance_leaf_paste_right(tb, ih, body); +} + +static void balance_leaf_new_nodes_insert(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + struct buffer_info bi; + int shift; + + /* new item or it part don't falls into S_new[i] */ + if (n - tb->snum[i] >= tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* new item or it's part falls to first new node S_new[i] */ + + /* part of new item falls into S_new[i] */ + if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { + int old_key_comp, old_len, r_zeroes_number; + const char *r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, + tb->S_new[i]); + + /* Remember key component and item length */ + version = ih_version(ih); + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* + * Calculate key component and item length to insert + * into S_new[i] + */ + shift = 0; + if (is_indirect_le_ih(ih)) + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + ((old_len - tb->sbytes[i]) << shift)); + + put_ih_item_len(ih, tb->sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + + if ((old_len - tb->sbytes[i]) > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + (old_len - tb->sbytes[i]) - + tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - (old_len - + tb->sbytes[i]); + tb->zeroes_num -= r_zeroes_number; } - reiserfs_invalidate_buffer(tb, tbS0); - return 0; + leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number); + + /* + * Calculate key component and item length to + * insert into S[i] + */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, old_len - tb->sbytes[i]); + tb->insert_size[0] -= tb->sbytes[i]; + } else { + /* whole new item falls into S_new[i] */ + + /* + * Shift snum[0] - 1 items to S_new[i] + * (sbytes[i] of split item) + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]); + + /* Insert new item into S_new[i] */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1, + ih, body, tb->zeroes_num); + + tb->zeroes_num = tb->insert_size[0] = 0; } +} - /* Fill new nodes that appear in place of S[0] */ +/* we append to directory item */ +static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int entry_count = ih_entry_count(aux_ih); + struct buffer_info bi; + + if (entry_count - tb->sbytes[i] < tb->pos_in_item && + tb->pos_in_item <= entry_count) { + /* new directory entry falls into S_new[i] */ + + RFALSE(!tb->insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE(tb->sbytes[i] - 1 >= entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + tb->sbytes[i] - 1, entry_count); + + /* + * Shift snum[i]-1 items in whole. + * Shift sbytes[i] directory entries + * from directory item number snum[i] + */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i] - 1, tb->S_new[i]); + + /* + * Paste given directory entry to + * directory item + */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, tb->insert_size[0], + body, tb->zeroes_num); + + /* paste new directory entry */ + leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count + + tb->sbytes[i] - 1, 1, + (struct reiserfs_de_head *) body, + body + DEH_SIZE, tb->insert_size[0]); + + tb->insert_size[0] = 0; + tb->pos_in_item++; + } else { + /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + } + +} - /* I am told that this copying is because we need an array to enable - the looping code. -Hans */ - snum[0] = tb->s1num, snum[1] = tb->s2num; - sbytes[0] = tb->s1bytes; - sbytes[1] = tb->s2bytes; +static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *aux_ih = item_head(tbS0, tb->item_pos); + int n_shift, n_rem, r_zeroes_number, shift; + const char *r_body; + struct item_head *tmp; + struct buffer_info bi; + + RFALSE(ih, "PAP-12210: ih must be 0"); + + if (is_direntry_le_ih(aux_ih)) { + balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key, + insert_ptr, i); + return; + } + + /* regular object */ + + + RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) || + tb->insert_size[0] <= 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* + * Calculate number of bytes which must be shifted from appended item + */ + n_shift = tb->sbytes[i] - tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift, + tb->S_new[i]); + + /* + * Calculate number of bytes which must remain in body after + * append to S_new[i] + */ + n_rem = tb->insert_size[0] - tb->sbytes[i]; + if (n_rem < 0) + n_rem = 0; + + /* Append part of body into S_new[0] */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + if (n_rem > tb->zeroes_num) { + r_zeroes_number = 0; + r_body = body + n_rem - tb->zeroes_num; + } else { + r_body = body; + r_zeroes_number = tb->zeroes_num - n_rem; + tb->zeroes_num -= r_zeroes_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, + r_body, r_zeroes_number); + + tmp = item_head(tb->S_new[i], 0); + shift = 0; + if (is_indirect_le_ih(tmp)) { + set_ih_free_space(tmp, 0); + shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT; + } + add_le_ih_k_offset(tmp, n_rem << shift); + + tb->insert_size[0] = n_rem; + if (!n_rem) + tb->pos_in_item++; +} + +static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) + +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + int leaf_mi; + struct item_head *pasted; + struct buffer_info bi; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih_check = item_head(tbS0, tb->item_pos); + + if (!is_direntry_le_ih(ih_check) && + (tb->pos_in_item != ih_item_len(ih_check) || + tb->insert_size[0] <= 0)) + reiserfs_panic(tb->tb_sb, + "PAP-12235", + "pos_in_item must be equal to ih_item_len"); +#endif + + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], + tb->sbytes[i], tb->S_new[i]); + + RFALSE(leaf_mi, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + leaf_mi); + + /* paste into item */ + buffer_info_init_bh(tb, &bi, tb->S_new[i]); + leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i], + tb->pos_in_item, tb->insert_size[0], + body, tb->zeroes_num); + + pasted = item_head(tb->S_new[i], tb->item_pos - n + + tb->snum[i]); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i], + tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + + tb->zeroes_num = tb->insert_size[0] = 0; + +} +static void balance_leaf_new_nodes_paste(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int i) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int n = B_NR_ITEMS(tbS0); + + /* pasted item doesn't fall into S_new[i] */ + if (n - tb->snum[i] > tb->item_pos) { + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + tb->snum[i], tb->sbytes[i], tb->S_new[i]); + return; + } + + /* pasted item or part if it falls to S_new[i] */ + + if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1) + /* we must shift part of the appended item */ + balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key, + insert_ptr, i); + else + /* item falls wholly into S_new[i] */ + balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key, + insert_ptr, i); +} + +/* Fill new nodes that appear in place of S[0] */ +static void balance_leaf_new_nodes(struct tree_balance *tb, + struct item_head *ih, + const char *body, + struct item_head *insert_key, + struct buffer_head **insert_ptr, + int flag) +{ + int i; for (i = tb->blknum[0] - 2; i >= 0; i--) { + BUG_ON(flag != M_INSERT && flag != M_PASTE); - RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, - snum[i]); + RFALSE(!tb->snum[i], + "PAP-12200: snum[%d] == %d. Must be > 0", i, + tb->snum[i]); /* here we shift from S to S_new nodes */ - S_new[i] = get_FEB(tb); + tb->S_new[i] = get_FEB(tb); /* initialized block type and tree level */ - set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL); - - n = B_NR_ITEMS(tbS0); - - switch (flag) { - case M_INSERT: /* insert item */ - - if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ - if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ - int old_key_comp, old_len, r_zeros_number; - const char *r_body; - int version; - - /* Move snum[i]-1 items from S[0] to S_new[i] */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i] - 1, -1, - S_new[i]); - /* Remember key component and item length */ - version = ih_version(ih); - old_key_comp = le_ih_k_offset(ih); - old_len = ih_item_len(ih); - - /* Calculate key component and item length to insert into S_new[i] */ - set_le_ih_k_offset(ih, le_ih_k_offset(ih) + - ((old_len - sbytes[i]) << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0))); - - put_ih_item_len(ih, sbytes[i]); - - /* Insert part of the item into S_new[i] before 0-th item */ - buffer_info_init_bh(tb, &bi, S_new[i]); - - if ((old_len - sbytes[i]) > zeros_num) { - r_zeros_number = 0; - r_body = body + (old_len - sbytes[i]) - zeros_num; - } else { - r_body = body; - r_zeros_number = zeros_num - (old_len - sbytes[i]); - zeros_num -= r_zeros_number; - } - - leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeros_number); - - /* Calculate key component and item length to insert into S[i] */ - set_le_ih_k_offset(ih, old_key_comp); - put_ih_item_len(ih, old_len - sbytes[i]); - tb->insert_size[0] -= sbytes[i]; - } else { /* whole new item falls into S_new[i] */ - - /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i] - 1, sbytes[i], S_new[i]); - - /* Insert new item into S_new[i] */ - buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_insert_into_buf(&bi, item_pos - n + snum[i] - 1, - ih, body, zeros_num); - - zeros_num = tb->insert_size[0] = 0; - } - } - - else { /* new item or it part don't falls into S_new[i] */ + set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL); + + if (flag == M_INSERT) + balance_leaf_new_nodes_insert(tb, ih, body, insert_key, + insert_ptr, i); + else /* M_PASTE */ + balance_leaf_new_nodes_paste(tb, ih, body, insert_key, + insert_ptr, i); + + memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE); + insert_ptr[i] = tb->S_new[i]; + + RFALSE(!buffer_journaled(tb->S_new[i]) + || buffer_journal_dirty(tb->S_new[i]) + || buffer_dirty(tb->S_new[i]), + "PAP-12247: S_new[%d] : (%b)", + i, tb->S_new[i]); + } +} - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i], sbytes[i], S_new[i]); - } - break; +static void balance_leaf_finish_node_insert(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + buffer_info_init_tbS0(tb, &bi); + leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num); - case M_PASTE: /* append item */ - - if (n - snum[i] <= item_pos) { /* pasted item or part if it falls to S_new[i] */ - if (item_pos == n - snum[i] && sbytes[i] != -1) { /* we must shift part of the appended item */ - struct item_head *aux_ih; - - RFALSE(ih, "PAP-12210: ih must be 0"); - - aux_ih = B_N_PITEM_HEAD(tbS0, item_pos); - if (is_direntry_le_ih(aux_ih)) { - /* we append to directory item */ - - int entry_count; - - entry_count = ih_entry_count(aux_ih); - - if (entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count) { - /* new directory entry falls into S_new[i] */ - - RFALSE(!tb->insert_size[0], "PAP-12215: insert_size is already 0"); - RFALSE(sbytes[i] - 1 >= entry_count, - "PAP-12220: there are no so much entries (%d), only %d", - sbytes[i] - 1, entry_count); - - /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i] - 1, S_new[i]); - /* Paste given directory entry to directory item */ - buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_paste_in_buffer(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, - tb->insert_size[0], body, zeros_num); - /* paste new directory entry */ - leaf_paste_entries(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, 1, - (struct reiserfs_de_head *) body, - body + DEH_SIZE, tb->insert_size[0]); - tb->insert_size[0] = 0; - pos_in_item++; - } else { /* new directory entry doesn't fall into S_new[i] */ - leaf_move_items(LEAF_FROM_S_TO_SNEW,tb, snum[i], sbytes[i], S_new[i]); - } - } else { /* regular object */ - - int n_shift, n_rem, r_zeros_number; - const char *r_body; - - RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)) || tb->insert_size[0] <= 0, - "PAP-12225: item too short or insert_size <= 0"); - - /* Calculate number of bytes which must be shifted from appended item */ - n_shift = sbytes[i] - tb->insert_size[0]; - if (n_shift < 0) - n_shift = 0; - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]); - - /* Calculate number of bytes which must remain in body after append to S_new[i] */ - n_rem = tb->insert_size[0] - sbytes[i]; - if (n_rem < 0) - n_rem = 0; - /* Append part of body into S_new[0] */ - buffer_info_init_bh(tb, &bi, S_new[i]); - if (n_rem > zeros_num) { - r_zeros_number = 0; - r_body = body + n_rem - zeros_num; - } else { - r_body = body; - r_zeros_number = zeros_num - n_rem; - zeros_num -= r_zeros_number; - } - - leaf_paste_in_buffer(&bi, 0, n_shift, - tb->insert_size[0] - n_rem, - r_body, r_zeros_number); - { - struct item_head *tmp; - - tmp = B_N_PITEM_HEAD(S_new[i], 0); - if (is_indirect_le_ih - (tmp)) { - set_ih_free_space(tmp, 0); - set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT))); - } else { - set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + n_rem); - } - } - - tb->insert_size[0] = n_rem; - if (!n_rem) - pos_in_item++; - } - } else - /* item falls wholly into S_new[i] */ - { - int leaf_mi; - struct item_head *pasted; + /* If we insert the first key change the delimiting key */ + if (tb->item_pos == 0) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); -#ifdef CONFIG_REISERFS_CHECK - struct item_head *ih_check = B_N_PITEM_HEAD(tbS0, item_pos); - - if (!is_direntry_le_ih(ih_check) - && (pos_in_item != ih_item_len(ih_check) - || tb->insert_size[0] <= 0)) - reiserfs_panic(tb->tb_sb, - "PAP-12235", - "pos_in_item " - "must be equal " - "to ih_item_len"); -#endif /* CONFIG_REISERFS_CHECK */ - - leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, - tb, snum[i], - sbytes[i], - S_new[i]); - - RFALSE(leaf_mi, - "PAP-12240: unexpected value returned by leaf_move_items (%d)", - leaf_mi); - - /* paste into item */ - buffer_info_init_bh(tb, &bi, S_new[i]); - leaf_paste_in_buffer(&bi, - item_pos - n + snum[i], - pos_in_item, - tb->insert_size[0], - body, zeros_num); - - pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]); - if (is_direntry_le_ih(pasted)) { - leaf_paste_entries(&bi, - item_pos - n + snum[i], - pos_in_item, 1, - (struct reiserfs_de_head *)body, - body + DEH_SIZE, - tb->insert_size[0] - ); - } - - /* if we paste to indirect item update ih_free_space */ - if (is_indirect_le_ih(pasted)) - set_ih_free_space(pasted, 0); - zeros_num = tb->insert_size[0] = 0; - } - } + } +} - else { /* pasted item doesn't fall into S_new[i] */ +static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct item_head *pasted = item_head(tbS0, tb->item_pos); + struct buffer_info bi; - leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, - snum[i], sbytes[i], S_new[i]); - } - break; - default: /* cases d and t */ - reiserfs_panic(tb->tb_sb, "PAP-12245", - "blknum > 2: unexpected mode: %s(%d)", - (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); + if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) { + RFALSE(!tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item, + tb->insert_size[0], body, tb->zeroes_num); + + /* paste entry */ + leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0]); + + if (!tb->item_pos && !tb->pos_in_item) { + RFALSE(!tb->CFL[0] || !tb->L[0], + "PAP-12270: CFL[0]/L[0] must be specified"); + if (tb->CFL[0]) + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); } - memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); - insert_ptr[i] = S_new[i]; + tb->insert_size[0] = 0; + } +} + +static void balance_leaf_finish_node_paste(struct tree_balance *tb, + struct item_head *ih, + const char *body) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + struct buffer_info bi; + struct item_head *pasted = item_head(tbS0, tb->item_pos); - RFALSE(!buffer_journaled(S_new[i]) - || buffer_journal_dirty(S_new[i]) - || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)", - i, S_new[i]); + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih(pasted)) { + balance_leaf_finish_node_paste_dirent(tb, ih, body); + return; } - /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the - affected item which remains in S */ - if (0 <= item_pos && item_pos < tb->s0num) { /* if we must insert or append into buffer S[0] */ + /* regular object */ - switch (flag) { - case M_INSERT: /* insert item into S[0] */ - buffer_info_init_tbS0(tb, &bi); - leaf_insert_into_buf(&bi, item_pos, ih, body, - zeros_num); + if (tb->pos_in_item == ih_item_len(pasted)) { + RFALSE(tb->insert_size[0] <= 0, + "PAP-12275: insert size must not be %d", + tb->insert_size[0]); + buffer_info_init_tbS0(tb, &bi); + leaf_paste_in_buffer(&bi, tb->item_pos, + tb->pos_in_item, tb->insert_size[0], body, + tb->zeroes_num); - /* If we insert the first key change the delimiting key */ - if (item_pos == 0) { - if (tb->CFL[0]) /* can be 0 in reiserfsck */ - replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); - } - break; + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); - case M_PASTE:{ /* append item in S[0] */ - struct item_head *pasted; - - pasted = B_N_PITEM_HEAD(tbS0, item_pos); - /* when directory, may be new entry already pasted */ - if (is_direntry_le_ih(pasted)) { - if (pos_in_item >= 0 && pos_in_item <= ih_entry_count(pasted)) { - - RFALSE(!tb->insert_size[0], - "PAP-12260: insert_size is 0 already"); - - /* prepare space */ - buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, item_pos, pos_in_item, - tb->insert_size[0], body, - zeros_num); - - /* paste entry */ - leaf_paste_entries(&bi, item_pos, pos_in_item, 1, - (struct reiserfs_de_head *)body, - body + DEH_SIZE, - tb->insert_size[0]); - if (!item_pos && !pos_in_item) { - RFALSE(!tb->CFL[0] || !tb->L[0], - "PAP-12270: CFL[0]/L[0] must be specified"); - if (tb->CFL[0]) - replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0); - } - tb->insert_size[0] = 0; - } - } else { /* regular object */ - if (pos_in_item == ih_item_len(pasted)) { - - RFALSE(tb->insert_size[0] <= 0, - "PAP-12275: insert size must not be %d", - tb->insert_size[0]); - buffer_info_init_tbS0(tb, &bi); - leaf_paste_in_buffer(&bi, item_pos, pos_in_item, - tb->insert_size[0], body, zeros_num); - - if (is_indirect_le_ih(pasted)) { -#if 0 - RFALSE(tb-> - insert_size[0] != - UNFM_P_SIZE, - "PAP-12280: insert_size for indirect item must be %d, not %d", - UNFM_P_SIZE, - tb-> - insert_size[0]); -#endif - set_ih_free_space(pasted, 0); - } - tb->insert_size[0] = 0; - } + tb->insert_size[0] = 0; + } #ifdef CONFIG_REISERFS_CHECK - else { - if (tb->insert_size[0]) { - print_cur_tb("12285"); - reiserfs_panic(tb->tb_sb, - "PAP-12285", - "insert_size " - "must be 0 " - "(%d)", - tb->insert_size[0]); - } - } -#endif /* CONFIG_REISERFS_CHECK */ - - } - } /* case M_PASTE: */ + else if (tb->insert_size[0]) { + print_cur_tb("12285"); + reiserfs_panic(tb->tb_sb, "PAP-12285", + "insert_size must be 0 (%d)", tb->insert_size[0]); + } +#endif +} + +/* + * if the affected item was not wholly shifted then we + * perform all necessary operations on that part or whole + * of the affected item which remains in S + */ +static void balance_leaf_finish_node(struct tree_balance *tb, + struct item_head *ih, + const char *body, int flag) +{ + /* if we must insert or append into buffer S[0] */ + if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { + if (flag == M_INSERT) + balance_leaf_finish_node_insert(tb, ih, body); + else /* M_PASTE */ + balance_leaf_finish_node_paste(tb, ih, body); + } +} + +/** + * balance_leaf - reiserfs tree balancing algorithm + * @tb: tree balance state + * @ih: item header of inserted item (little endian) + * @body: body of inserted item or bytes to paste + * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance) + * passed back: + * @insert_key: key to insert new nodes + * @insert_ptr: array of nodes to insert at the next level + * + * In our processing of one level we sometimes determine what must be + * inserted into the next higher level. This insertion consists of a + * key or two keys and their corresponding pointers. + */ +static int balance_leaf(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag, + struct item_head *insert_key, + struct buffer_head **insert_ptr) +{ + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + + PROC_INFO_INC(tb->tb_sb, balance_at[0]); + + /* Make balance in case insert_size[0] < 0 */ + if (tb->insert_size[0] < 0) + return balance_leaf_when_delete(tb, flag); + + tb->item_pos = PATH_LAST_POSITION(tb->tb_path), + tb->pos_in_item = tb->tb_path->pos_in_item, + tb->zeroes_num = 0; + if (flag == M_INSERT && !body) + tb->zeroes_num = ih_item_len(ih); + + /* + * for indirect item pos_in_item is measured in unformatted node + * pointers. Recalculate to bytes + */ + if (flag != M_INSERT + && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) + tb->pos_in_item *= UNFM_P_SIZE; + + balance_leaf_left(tb, ih, body, flag); + + /* tb->lnum[0] > 0 */ + /* Calculate new item position */ + tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0)); + + balance_leaf_right(tb, ih, body, flag); + + /* tb->rnum[0] > 0 */ + RFALSE(tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); + RFALSE(tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]); + + /* + * if while adding to a node we discover that it is possible to split + * it in two, and merge the left part into the left neighbor and the + * right part into the right neighbor, eliminating the node + */ + if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + + RFALSE(!tb->lnum[0] || !tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* + * if insertion was done before 0-th position in R[0], right + * delimiting key of the tb->L[0]'s and left delimiting key are + * not set correctly + */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic(tb->tb_sb, "vs-12195", + "CFR not initialized"); + copy_key(internal_key(tb->CFL[0], tb->lkey[0]), + internal_key(tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); } + + reiserfs_invalidate_buffer(tb, tbS0); + return 0; } + + balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag); + + balance_leaf_finish_node(tb, ih, body, flag); + #ifdef CONFIG_REISERFS_CHECK if (flag == M_PASTE && tb->insert_size[0]) { print_cur_tb("12290"); @@ -1137,9 +1454,11 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h "PAP-12290", "insert_size is still not 0 (%d)", tb->insert_size[0]); } -#endif /* CONFIG_REISERFS_CHECK */ +#endif + + /* Leaf level of the tree is balanced (end of balance_leaf) */ return 0; -} /* Leaf level of the tree is balanced (end of balance_leaf) */ +} /* Make empty node */ void make_empty_node(struct buffer_info *bi) @@ -1178,9 +1497,7 @@ struct buffer_head *get_FEB(struct tree_balance *tb) return tb->used[i]; } -/* This is now used because reiserfs_free_block has to be able to -** schedule. -*/ +/* This is now used because reiserfs_free_block has to be able to schedule. */ static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) { int i; @@ -1246,10 +1563,10 @@ void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, if (B_IS_ITEMS_LEVEL(src)) /* source buffer contains leaf node */ - memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src), + memcpy(internal_key(dest, n_dest), item_head(src, n_src), KEY_SIZE); else - memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src), + memcpy(internal_key(dest, n_dest), internal_key(src, n_src), KEY_SIZE); do_balance_mark_internal_dirty(tb, dest, 0); @@ -1335,8 +1652,10 @@ static int check_before_balancing(struct tree_balance *tb) "mount point."); } - /* double check that buffers that we will modify are unlocked. (fix_nodes should already have - prepped all of these for us). */ + /* + * double check that buffers that we will modify are unlocked. + * (fix_nodes should already have prepped all of these for us). + */ if (tb->lnum[0]) { retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]"); retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]"); @@ -1429,49 +1748,51 @@ static void check_internal_levels(struct tree_balance *tb) #endif -/* Now we have all of the buffers that must be used in balancing of - the tree. We rely on the assumption that schedule() will not occur - while do_balance works. ( Only interrupt handlers are acceptable.) - We balance the tree according to the analysis made before this, - using buffers already obtained. For SMP support it will someday be - necessary to add ordered locking of tb. */ - -/* Some interesting rules of balancing: - - we delete a maximum of two nodes per level per balancing: we never - delete R, when we delete two of three nodes L, S, R then we move - them into R. - - we only delete L if we are deleting two nodes, if we delete only - one node we delete S - - if we shift leaves then we shift as much as we can: this is a - deliberate policy of extremism in node packing which results in - higher average utilization after repeated random balance operations - at the cost of more memory copies and more balancing as a result of - small insertions to full nodes. - - if we shift internal nodes we try to evenly balance the node - utilization, with consequent less balancing at the cost of lower - utilization. - - one could argue that the policy for directories in leaves should be - that of internal nodes, but we will wait until another day to - evaluate this.... It would be nice to someday measure and prove - these assumptions as to what is optimal.... +/* + * Now we have all of the buffers that must be used in balancing of + * the tree. We rely on the assumption that schedule() will not occur + * while do_balance works. ( Only interrupt handlers are acceptable.) + * We balance the tree according to the analysis made before this, + * using buffers already obtained. For SMP support it will someday be + * necessary to add ordered locking of tb. + */ -*/ +/* + * Some interesting rules of balancing: + * we delete a maximum of two nodes per level per balancing: we never + * delete R, when we delete two of three nodes L, S, R then we move + * them into R. + * + * we only delete L if we are deleting two nodes, if we delete only + * one node we delete S + * + * if we shift leaves then we shift as much as we can: this is a + * deliberate policy of extremism in node packing which results in + * higher average utilization after repeated random balance operations + * at the cost of more memory copies and more balancing as a result of + * small insertions to full nodes. + * + * if we shift internal nodes we try to evenly balance the node + * utilization, with consequent less balancing at the cost of lower + * utilization. + * + * one could argue that the policy for directories in leaves should be + * that of internal nodes, but we will wait until another day to + * evaluate this.... It would be nice to someday measure and prove + * these assumptions as to what is optimal.... + */ static inline void do_balance_starts(struct tree_balance *tb) { - /* use print_cur_tb() to see initial state of struct - tree_balance */ + /* use print_cur_tb() to see initial state of struct tree_balance */ /* store_print_tb (tb); */ /* do not delete, just comment it out */ -/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, - "check");*/ + /* + print_tb(flag, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item, tb, "check"); + */ RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); #ifdef CONFIG_REISERFS_CHECK REISERFS_SB(tb->tb_sb)->cur_tb = tb; @@ -1487,9 +1808,10 @@ static inline void do_balance_completed(struct tree_balance *tb) REISERFS_SB(tb->tb_sb)->cur_tb = NULL; #endif - /* reiserfs_free_block is no longer schedule safe. So, we need to - ** put the buffers we want freed on the thrown list during do_balance, - ** and then free them now + /* + * reiserfs_free_block is no longer schedule safe. So, we need to + * put the buffers we want freed on the thrown list during do_balance, + * and then free them now */ REISERFS_SB(tb->tb_sb)->s_do_balance++; @@ -1500,36 +1822,40 @@ static inline void do_balance_completed(struct tree_balance *tb) free_thrown(tb); } -void do_balance(struct tree_balance *tb, /* tree_balance structure */ - struct item_head *ih, /* item header of inserted item */ - const char *body, /* body of inserted item or bytes to paste */ - int flag) -{ /* i - insert, d - delete - c - cut, p - paste - - Cut means delete part of an item - (includes removing an entry from a - directory). - - Delete means delete whole item. - - Insert means add a new item into the - tree. - - Paste means to append to the end of an - existing file or to insert a directory - entry. */ - int child_pos, /* position of a child node in its parent */ - h; /* level of the tree being processed */ - struct item_head insert_key[2]; /* in our processing of one level - we sometimes determine what - must be inserted into the next - higher level. This insertion - consists of a key or two keys - and their corresponding - pointers */ - struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next - level */ +/* + * do_balance - balance the tree + * + * @tb: tree_balance structure + * @ih: item header of inserted item + * @body: body of inserted item or bytes to paste + * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste + * + * Cut means delete part of an item (includes removing an entry from a + * directory). + * + * Delete means delete whole item. + * + * Insert means add a new item into the tree. + * + * Paste means to append to the end of an existing file or to + * insert a directory entry. + */ +void do_balance(struct tree_balance *tb, struct item_head *ih, + const char *body, int flag) +{ + int child_pos; /* position of a child node in its parent */ + int h; /* level of the tree being processed */ + + /* + * in our processing of one level we sometimes determine what + * must be inserted into the next higher level. This insertion + * consists of a key or two keys and their corresponding + * pointers + */ + struct item_head insert_key[2]; + + /* inserted node-ptrs for the next level */ + struct buffer_head *insert_ptr[2]; tb->tb_mode = flag; tb->need_balance_dirty = 0; @@ -1546,12 +1872,14 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */ return; } - atomic_inc(&(fs_generation(tb->tb_sb))); + atomic_inc(&fs_generation(tb->tb_sb)); do_balance_starts(tb); - /* balance leaf returns 0 except if combining L R and S into - one node. see balance_internal() for explanation of this - line of code. */ + /* + * balance_leaf returns 0 except if combining L R and S into + * one node. see balance_internal() for explanation of this + * line of code. + */ child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); @@ -1561,9 +1889,8 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */ /* Balance internal level of the tree. */ for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) - child_pos = - balance_internal(tb, h, child_pos, insert_key, insert_ptr); + child_pos = balance_internal(tb, h, child_pos, insert_key, + insert_ptr); do_balance_completed(tb); - } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index ed58d843d578..db9e80ba53a0 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -15,20 +15,20 @@ #include <linux/quotaops.h> /* -** We pack the tails of files on file close, not at the time they are written. -** This implies an unnecessary copy of the tail and an unnecessary indirect item -** insertion/balancing, for files that are written in one write. -** It avoids unnecessary tail packings (balances) for files that are written in -** multiple writes and are small enough to have tails. -** -** file_release is called by the VFS layer when the file is closed. If -** this is the last open file descriptor, and the file -** small enough to have a tail, and the tail is currently in an -** unformatted node, the tail is converted back into a direct item. -** -** We use reiserfs_truncate_file to pack the tail, since it already has -** all the conditions coded. -*/ + * We pack the tails of files on file close, not at the time they are written. + * This implies an unnecessary copy of the tail and an unnecessary indirect item + * insertion/balancing, for files that are written in one write. + * It avoids unnecessary tail packings (balances) for files that are written in + * multiple writes and are small enough to have tails. + * + * file_release is called by the VFS layer when the file is closed. If + * this is the last open file descriptor, and the file + * small enough to have a tail, and the tail is currently in an + * unformatted node, the tail is converted back into a direct item. + * + * We use reiserfs_truncate_file to pack the tail, since it already has + * all the conditions coded. + */ static int reiserfs_file_release(struct inode *inode, struct file *filp) { @@ -41,10 +41,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) return 0; - mutex_lock(&(REISERFS_I(inode)->tailpack)); + mutex_lock(&REISERFS_I(inode)->tailpack); if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); return 0; } @@ -52,31 +52,35 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || !tail_has_to_be_packed(inode)) && REISERFS_I(inode)->i_prealloc_count <= 0) { - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); return 0; } reiserfs_write_lock(inode->i_sb); - /* freeing preallocation only involves relogging blocks that + /* + * freeing preallocation only involves relogging blocks that * are already in the current transaction. preallocation gets * freed at the end of each transaction, so it is impossible for * us to log any additional blocks (including quota blocks) */ err = journal_begin(&th, inode->i_sb, 1); if (err) { - /* uh oh, we can't allow the inode to go away while there + /* + * uh oh, we can't allow the inode to go away while there * is still preallocation blocks pending. Try to join the * aborted transaction */ jbegin_failure = err; - err = journal_join_abort(&th, inode->i_sb, 1); + err = journal_join_abort(&th, inode->i_sb); if (err) { - /* hmpf, our choices here aren't good. We can pin the inode - * which will disallow unmount from every happening, we can - * do nothing, which will corrupt random memory on unmount, - * or we can forcibly remove the file from the preallocation - * list, which will leak blocks on disk. Lets pin the inode + /* + * hmpf, our choices here aren't good. We can pin + * the inode which will disallow unmount from ever + * happening, we can do nothing, which will corrupt + * random memory on unmount, or we can forcibly + * remove the file from the preallocation list, which + * will leak blocks on disk. Lets pin the inode * and let the admin know what is going on. */ igrab(inode); @@ -92,7 +96,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) #ifdef REISERFS_PREALLOCATE reiserfs_discard_prealloc(&th, inode); #endif - err = journal_end(&th, inode->i_sb, 1); + err = journal_end(&th); /* copy back the error code from journal_begin */ if (!err) @@ -102,35 +106,38 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && tail_has_to_be_packed(inode)) { - /* if regular file is released by last holder and it has been - appended (we append by unformatted node only) or its direct - item(s) had to be converted, then it may have to be - indirect2direct converted */ + /* + * if regular file is released by last holder and it has been + * appended (we append by unformatted node only) or its direct + * item(s) had to be converted, then it may have to be + * indirect2direct converted + */ err = reiserfs_truncate_file(inode, 0); } - out: +out: reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); return err; } static int reiserfs_file_open(struct inode *inode, struct file *file) { int err = dquot_file_open(inode, file); + + /* somebody might be tailpacking on final close; wait for it */ if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) { - /* somebody might be tailpacking on final close; wait for it */ - mutex_lock(&(REISERFS_I(inode)->tailpack)); + mutex_lock(&REISERFS_I(inode)->tailpack); atomic_inc(&REISERFS_I(inode)->openers); - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); } return err; } void reiserfs_vfs_truncate_file(struct inode *inode) { - mutex_lock(&(REISERFS_I(inode)->tailpack)); + mutex_lock(&REISERFS_I(inode)->tailpack); reiserfs_truncate_file(inode, 1); - mutex_unlock(&(REISERFS_I(inode)->tailpack)); + mutex_unlock(&REISERFS_I(inode)->tailpack); } /* Sync a reiserfs file. */ @@ -205,10 +212,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, set_buffer_uptodate(bh); if (logit) { reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); } else if (!buffer_dirty(bh)) { mark_buffer_dirty(bh); - /* do data=ordered on any page past the end + /* + * do data=ordered on any page past the end * of file and any buffer marked BH_New. */ if (reiserfs_data_ordered(inode->i_sb) && @@ -219,8 +227,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, } } if (logit) { - ret = journal_end(&th, s, bh_per_page + 1); - drop_write_lock: + ret = journal_end(&th); +drop_write_lock: reiserfs_write_unlock(s); } /* @@ -235,8 +243,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, } const struct file_operations reiserfs_file_operations = { - .read = do_sync_read, - .write = do_sync_write, + .read = new_sync_read, + .write = new_sync_write, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, @@ -245,10 +253,10 @@ const struct file_operations reiserfs_file_operations = { .open = reiserfs_file_open, .release = reiserfs_file_release, .fsync = reiserfs_sync_file, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index dc4d41530316..6b0ddb2a9091 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2,59 +2,32 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -/** - ** old_item_num - ** old_entry_num - ** set_entry_sizes - ** create_virtual_node - ** check_left - ** check_right - ** directory_part_size - ** get_num_ver - ** set_parameters - ** is_leaf_removable - ** are_leaves_removable - ** get_empty_nodes - ** get_lfree - ** get_rfree - ** is_left_neighbor_in_cache - ** decrement_key - ** get_far_parent - ** get_parents - ** can_node_be_removed - ** ip_check_balance - ** dc_check_balance_internal - ** dc_check_balance_leaf - ** dc_check_balance - ** check_balance - ** get_direct_parent - ** get_neighbors - ** fix_nodes - ** - ** - **/ - #include <linux/time.h> #include <linux/slab.h> #include <linux/string.h> #include "reiserfs.h" #include <linux/buffer_head.h> -/* To make any changes in the tree we find a node, that contains item - to be changed/deleted or position in the node we insert a new item - to. We call this node S. To do balancing we need to decide what we - will shift to left/right neighbor, or to a new node, where new item - will be etc. To make this analysis simpler we build virtual - node. Virtual node is an array of items, that will replace items of - node S. (For instance if we are going to delete an item, virtual - node does not contain it). Virtual node keeps information about - item sizes and types, mergeability of first and last items, sizes - of all entries in directory item. We use this array of items when - calculating what we can shift to neighbors and how many nodes we - have to have if we do not any shiftings, if we shift to left/right - neighbor or to both. */ - -/* taking item number in virtual node, returns number of item, that it has in source buffer */ +/* + * To make any changes in the tree we find a node that contains item + * to be changed/deleted or position in the node we insert a new item + * to. We call this node S. To do balancing we need to decide what we + * will shift to left/right neighbor, or to a new node, where new item + * will be etc. To make this analysis simpler we build virtual + * node. Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not any shiftings, if we shift to left/right + * neighbor or to both. + */ + +/* + * Takes item number in virtual node, returns number of item + * that it has in source buffer + */ static inline int old_item_num(int new_num, int affected_item_num, int mode) { if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) @@ -105,14 +78,17 @@ static void create_virtual_node(struct tree_balance *tb, int h) vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); /* first item in the node */ - ih = B_N_PITEM_HEAD(Sh, 0); + ih = item_head(Sh, 0); /* define the mergeability for 0-th item (if it is not being deleted) */ - if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size) + if (op_is_left_mergeable(&ih->ih_key, Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; - /* go through all items those remain in the virtual node (except for the new (inserted) one) */ + /* + * go through all items that remain in the virtual + * node (except for the new (inserted) one) + */ for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { int j; struct virtual_item *vi = vn->vn_vi + new_num; @@ -128,11 +104,13 @@ static void create_virtual_node(struct tree_balance *tb, int h) vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; vi->vi_ih = ih + j; - vi->vi_item = B_I_PITEM(Sh, ih + j); + vi->vi_item = ih_item_body(Sh, ih + j); vi->vi_uarea = vn->vn_free_ptr; - // FIXME: there is no check, that item operation did not - // consume too much memory + /* + * FIXME: there is no check that item operation did not + * consume too much memory + */ vn->vn_free_ptr += op_create_vi(vn, vi, is_affected, tb->insert_size[0]); if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) @@ -145,7 +123,8 @@ static void create_virtual_node(struct tree_balance *tb, int h) if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; - vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + /* pointer to data which is going to be pasted */ + vi->vi_new_data = vn->vn_data; } } @@ -164,11 +143,14 @@ static void create_virtual_node(struct tree_balance *tb, int h) tb->insert_size[0]); } - /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ + /* + * set right merge flag we take right delimiting key and + * check whether it is a mergeable item + */ if (tb->CFR[0]) { struct reiserfs_key *key; - key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]); + key = internal_key(tb->CFR[0], tb->rkey[0]); if (op_is_left_mergeable(key, Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) @@ -179,12 +161,19 @@ static void create_virtual_node(struct tree_balance *tb, int h) if (op_is_left_mergeable(key, Sh->b_size) && !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { - /* we delete last item and it could be merged with right neighbor's first item */ + /* + * we delete last item and it could be merged + * with right neighbor's first item + */ if (! (B_NR_ITEMS(Sh) == 1 - && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0)) - && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) { - /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ + && is_direntry_le_ih(item_head(Sh, 0)) + && ih_entry_count(item_head(Sh, 0)) == 1)) { + /* + * node contains more than 1 item, or item + * is not directory item, or this item + * contains more than 1 entry + */ print_block(Sh, 0, -1, -1); reiserfs_panic(tb->tb_sb, "vs-8045", "rdkey %k, affected item==%d " @@ -198,8 +187,10 @@ static void create_virtual_node(struct tree_balance *tb, int h) } } -/* using virtual node check, how many items can be shifted to left - neighbor */ +/* + * Using virtual node check, how many items can be + * shifted to left neighbor + */ static void check_left(struct tree_balance *tb, int h, int cur_free) { int i; @@ -259,9 +250,13 @@ static void check_left(struct tree_balance *tb, int h, int cur_free) } /* the item cannot be shifted entirely, try to split it */ - /* check whether L[0] can hold ih and at least one byte of the item body */ + /* + * check whether L[0] can hold ih and at least one byte + * of the item body + */ + + /* cannot shift even a part of the current item */ if (cur_free <= ih_size) { - /* cannot shift even a part of the current item */ tb->lbytes = -1; return; } @@ -278,8 +273,10 @@ static void check_left(struct tree_balance *tb, int h, int cur_free) return; } -/* using virtual node check, how many items can be shifted to right - neighbor */ +/* + * Using virtual node check, how many items can be + * shifted to right neighbor + */ static void check_right(struct tree_balance *tb, int h, int cur_free) { int i; @@ -338,13 +335,21 @@ static void check_right(struct tree_balance *tb, int h, int cur_free) continue; } - /* check whether R[0] can hold ih and at least one byte of the item body */ - if (cur_free <= ih_size) { /* cannot shift even a part of the current item */ + /* + * check whether R[0] can hold ih and at least one + * byte of the item body + */ + + /* cannot shift even a part of the current item */ + if (cur_free <= ih_size) { tb->rbytes = -1; return; } - /* R[0] can hold the header of the item and at least one byte of its body */ + /* + * R[0] can hold the header of the item and at least + * one byte of its body + */ cur_free -= ih_size; /* cur_free is still > 0 */ tb->rbytes = op_check_right(vi, cur_free); @@ -361,45 +366,64 @@ static void check_right(struct tree_balance *tb, int h, int cur_free) /* * from - number of items, which are shifted to left neighbor entirely * to - number of item, which are shifted to right neighbor entirely - * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor - * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ + * from_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) + * which are shifted to right neighbor + */ static int get_num_ver(int mode, struct tree_balance *tb, int h, int from, int from_bytes, int to, int to_bytes, short *snum012, int flow) { int i; int cur_free; - // int bytes; int units; struct virtual_node *vn = tb->tb_vn; - // struct virtual_item * vi; - int total_node_size, max_node_size, current_item_size; int needed_nodes; - int start_item, /* position of item we start filling node from */ - end_item, /* position of item we finish filling node by */ - start_bytes, /* number of first bytes (entries for directory) of start_item-th item - we do not include into node that is being filled */ - end_bytes; /* number of last bytes (entries for directory) of end_item-th item - we do node include into node that is being filled */ - int split_item_positions[2]; /* these are positions in virtual item of - items, that are split between S[0] and - S1new and S1new and S2new */ + + /* position of item we start filling node from */ + int start_item; + + /* position of item we finish filling node by */ + int end_item; + + /* + * number of first bytes (entries for directory) of start_item-th item + * we do not include into node that is being filled + */ + int start_bytes; + + /* + * number of last bytes (entries for directory) of end_item-th item + * we do node include into node that is being filled + */ + int end_bytes; + + /* + * these are positions in virtual item of items, that are split + * between S[0] and S1new and S1new and S2new + */ + int split_item_positions[2]; split_item_positions[0] = -1; split_item_positions[1] = -1; - /* We only create additional nodes if we are in insert or paste mode - or we are in replace mode at the internal level. If h is 0 and - the mode is M_REPLACE then in fix_nodes we change the mode to - paste or insert before we get here in the code. */ + /* + * We only create additional nodes if we are in insert or paste mode + * or we are in replace mode at the internal level. If h is 0 and + * the mode is M_REPLACE then in fix_nodes we change the mode to + * paste or insert before we get here in the code. + */ RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), "vs-8100: insert_size < 0 in overflow"); max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); - /* snum012 [0-2] - number of items, that lay - to S[0], first new node and second new node */ + /* + * snum012 [0-2] - number of items, that lay + * to S[0], first new node and second new node + */ snum012[3] = -1; /* s1bytes */ snum012[4] = -1; /* s2bytes */ @@ -416,20 +440,22 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, total_node_size = 0; cur_free = max_node_size; - // start from 'from'-th item + /* start from 'from'-th item */ start_item = from; - // skip its first 'start_bytes' units + /* skip its first 'start_bytes' units */ start_bytes = ((from_bytes != -1) ? from_bytes : 0); - // last included item is the 'end_item'-th one + /* last included item is the 'end_item'-th one */ end_item = vn->vn_nr_item - to - 1; - // do not count last 'end_bytes' units of 'end_item'-th item + /* do not count last 'end_bytes' units of 'end_item'-th item */ end_bytes = (to_bytes != -1) ? to_bytes : 0; - /* go through all item beginning from the start_item-th item and ending by - the end_item-th item. Do not count first 'start_bytes' units of - 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ - + /* + * go through all item beginning from the start_item-th item + * and ending by the end_item-th item. Do not count first + * 'start_bytes' units of 'start_item'-th item and last + * 'end_bytes' of 'end_item'-th item + */ for (i = start_item; i <= end_item; i++) { struct virtual_item *vi = vn->vn_vi + i; int skip_from_end = ((i == end_item) ? end_bytes : 0); @@ -439,7 +465,10 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, /* get size of current item */ current_item_size = vi->vi_item_len; - /* do not take in calculation head part (from_bytes) of from-th item */ + /* + * do not take in calculation head part (from_bytes) + * of from-th item + */ current_item_size -= op_part_size(vi, 0 /*from start */ , start_bytes); @@ -455,9 +484,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, continue; } + /* + * virtual item length is longer, than max size of item in + * a node. It is impossible for direct item + */ if (current_item_size > max_node_size) { - /* virtual item length is longer, than max size of item in - a node. It is impossible for direct item */ RFALSE(is_direct_le_ih(vi->vi_ih), "vs-8110: " "direct item length is %d. It can not be longer than %d", @@ -466,15 +497,18 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, flow = 1; } + /* as we do not split items, take new node and continue */ if (!flow) { - /* as we do not split items, take new node and continue */ needed_nodes++; i--; total_node_size = 0; continue; } - // calculate number of item units which fit into node being - // filled + + /* + * calculate number of item units which fit into node being + * filled + */ { int free_space; @@ -482,17 +516,17 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, units = op_check_left(vi, free_space, start_bytes, skip_from_end); + /* + * nothing fits into current node, take new + * node and continue + */ if (units == -1) { - /* nothing fits into current node, take new node and continue */ needed_nodes++, i--, total_node_size = 0; continue; } } /* something fits into the current node */ - //if (snum012[3] != -1 || needed_nodes != 1) - // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); - //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; start_bytes += units; snum012[needed_nodes - 1 + 3] = units; @@ -508,9 +542,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, total_node_size = 0; } - // sum012[4] (if it is not -1) contains number of units of which - // are to be in S1new, snum012[3] - to be in S0. They are supposed - // to be S1bytes and S2bytes correspondingly, so recalculate + /* + * sum012[4] (if it is not -1) contains number of units of which + * are to be in S1new, snum012[3] - to be in S0. They are supposed + * to be S1bytes and S2bytes correspondingly, so recalculate + */ if (snum012[4] > 0) { int split_item_num; int bytes_to_r, bytes_to_l; @@ -527,7 +563,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0); - // s2bytes + /* s2bytes */ snum012[4] = op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new; @@ -555,7 +591,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0); - // s1bytes + /* s1bytes */ snum012[3] = op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new; @@ -565,7 +601,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, } -/* Set parameters for balancing. +/* + * Set parameters for balancing. * Performs write of results of analysis of balancing into structure tb, * where it will later be used by the functions that actually do the balancing. * Parameters: @@ -575,11 +612,12 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, * rnum number of items from S[h] that must be shifted to R[h]; * blk_num number of blocks that S[h] will be splitted into; * s012 number of items that fall into splitted nodes. - * lbytes number of bytes which flow to the left neighbor from the item that is not - * not shifted entirely - * rbytes number of bytes which flow to the right neighbor from the item that is not - * not shifted entirely - * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) + * lbytes number of bytes which flow to the left neighbor from the + * item that is not not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the + * item that is not not shifted entirely + * s1bytes number of bytes which flow to the first new node when + * S[0] splits (this number is contained in s012 array) */ static void set_parameters(struct tree_balance *tb, int h, int lnum, @@ -590,12 +628,14 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum, tb->rnum[h] = rnum; tb->blknum[h] = blk_num; - if (h == 0) { /* only for leaf level */ + /* only for leaf level */ + if (h == 0) { if (s012 != NULL) { - tb->s0num = *s012++, - tb->s1num = *s012++, tb->s2num = *s012++; - tb->s1bytes = *s012++; - tb->s2bytes = *s012; + tb->s0num = *s012++; + tb->snum[0] = *s012++; + tb->snum[1] = *s012++; + tb->sbytes[0] = *s012++; + tb->sbytes[1] = *s012; } tb->lbytes = lb; tb->rbytes = rb; @@ -607,8 +647,10 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum, PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); } -/* check, does node disappear if we shift tb->lnum[0] items to left - neighbor and tb->rnum[0] to the right one. */ +/* + * check if node disappears if we shift tb->lnum[0] items to left + * neighbor and tb->rnum[0] to the right one. + */ static int is_leaf_removable(struct tree_balance *tb) { struct virtual_node *vn = tb->tb_vn; @@ -616,8 +658,10 @@ static int is_leaf_removable(struct tree_balance *tb) int size; int remain_items; - /* number of items, that will be shifted to left (right) neighbor - entirely */ + /* + * number of items that will be shifted to left (right) neighbor + * entirely + */ to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0); remain_items = vn->vn_nr_item; @@ -625,21 +669,21 @@ static int is_leaf_removable(struct tree_balance *tb) /* how many items remain in S[0] after shiftings to neighbors */ remain_items -= (to_left + to_right); + /* all content of node can be shifted to neighbors */ if (remain_items < 1) { - /* all content of node can be shifted to neighbors */ set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1); return 1; } + /* S[0] is not removable */ if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) - /* S[0] is not removable */ return 0; - /* check, whether we can divide 1 remaining item between neighbors */ + /* check whether we can divide 1 remaining item between neighbors */ /* get size of remaining item (in item units) */ - size = op_unit_num(&(vn->vn_vi[to_left])); + size = op_unit_num(&vn->vn_vi[to_left]); if (tb->lbytes + tb->rbytes >= size) { set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, @@ -675,23 +719,28 @@ static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0)); - ih = B_N_PITEM_HEAD(S0, 0); + ih = item_head(S0, 0); if (tb->CFR[0] - && !comp_short_le_keys(&(ih->ih_key), - B_N_PDELIM_KEY(tb->CFR[0], + && !comp_short_le_keys(&ih->ih_key, + internal_key(tb->CFR[0], tb->rkey[0]))) + /* + * Directory must be in correct state here: that is + * somewhere at the left side should exist first + * directory item. But the item being deleted can + * not be that first one because its right neighbor + * is item of the same directory. (But first item + * always gets deleted in last turn). So, neighbors + * of deleted item can be merged, so we can save + * ih_size + */ if (is_direntry_le_ih(ih)) { - /* Directory must be in correct state here: that is - somewhere at the left side should exist first directory - item. But the item being deleted can not be that first - one because its right neighbor is item of the same - directory. (But first item always gets deleted in last - turn). So, neighbors of deleted item can be merged, so - we can save ih_size */ ih_size = IH_SIZE; - /* we might check that left neighbor exists and is of the - same directory */ + /* + * we might check that left neighbor exists + * and is of the same directory + */ RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, "vs-8130: first directory item can not be removed until directory is not empty"); } @@ -770,7 +819,8 @@ static void free_buffers_in_tb(struct tree_balance *tb) } } -/* Get new buffers for storing new nodes that are created while balancing. +/* + * Get new buffers for storing new nodes that are created while balancing. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; * NO_DISK_SPACE - no disk space. @@ -778,28 +828,33 @@ static void free_buffers_in_tb(struct tree_balance *tb) /* The function is NOT SCHEDULE-SAFE! */ static int get_empty_nodes(struct tree_balance *tb, int h) { - struct buffer_head *new_bh, - *Sh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h); b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; - int counter, number_of_freeblk, amount_needed, /* number of needed empty blocks */ - retval = CARRY_ON; + int counter, number_of_freeblk; + int amount_needed; /* number of needed empty blocks */ + int retval = CARRY_ON; struct super_block *sb = tb->tb_sb; - /* number_of_freeblk is the number of empty blocks which have been - acquired for use by the balancing algorithm minus the number of - empty blocks used in the previous levels of the analysis, - number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs - after empty blocks are acquired, and the balancing analysis is - then restarted, amount_needed is the number needed by this level - (h) of the balancing analysis. - - Note that for systems with many processes writing, it would be - more layout optimal to calculate the total number needed by all - levels and then to run reiserfs_new_blocks to get all of them at once. */ - - /* Initiate number_of_freeblk to the amount acquired prior to the restart of - the analysis or 0 if not restarted, then subtract the amount needed - by all of the levels of the tree below h. */ + /* + * number_of_freeblk is the number of empty blocks which have been + * acquired for use by the balancing algorithm minus the number of + * empty blocks used in the previous levels of the analysis, + * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule + * occurs after empty blocks are acquired, and the balancing analysis + * is then restarted, amount_needed is the number needed by this + * level (h) of the balancing analysis. + * + * Note that for systems with many processes writing, it would be + * more layout optimal to calculate the total number needed by all + * levels and then to run reiserfs_new_blocks to get all of them at + * once. + */ + + /* + * Initiate number_of_freeblk to the amount acquired prior to the + * restart of the analysis or 0 if not restarted, then subtract the + * amount needed by all of the levels of the tree below h. + */ /* blknum includes S[h], so we subtract 1 in this calculation */ for (counter = 0, number_of_freeblk = tb->cur_blknum; counter < h; counter++) @@ -810,13 +865,19 @@ static int get_empty_nodes(struct tree_balance *tb, int h) /* Allocate missing empty blocks. */ /* if Sh == 0 then we are getting a new root */ amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1; - /* Amount_needed = the amount that we need more than the amount that we have. */ + /* + * Amount_needed = the amount that we need more than the + * amount that we have. + */ if (amount_needed > number_of_freeblk) amount_needed -= number_of_freeblk; - else /* If we have enough already then there is nothing to do. */ + else /* If we have enough already then there is nothing to do. */ return CARRY_ON; - /* No need to check quota - is not allocated for blocks used for formatted nodes */ + /* + * No need to check quota - is not allocated for blocks used + * for formatted nodes + */ if (reiserfs_new_form_blocknrs(tb, blocknrs, amount_needed) == NO_DISK_SPACE) return NO_DISK_SPACE; @@ -849,8 +910,10 @@ static int get_empty_nodes(struct tree_balance *tb, int h) return retval; } -/* Get free space of the left neighbor, which is stored in the parent - * node of the left neighbor. */ +/* + * Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. + */ static int get_lfree(struct tree_balance *tb, int h) { struct buffer_head *l, *f; @@ -870,7 +933,8 @@ static int get_lfree(struct tree_balance *tb, int h) return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); } -/* Get free space of the right neighbor, +/* + * Get free space of the right neighbor, * which is stored in the parent node of the right neighbor. */ static int get_rfree(struct tree_balance *tb, int h) @@ -916,7 +980,10 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", father, tb->FL[h]); - /* Get position of the pointer to the left neighbor into the left father. */ + /* + * Get position of the pointer to the left neighbor + * into the left father. + */ left_neighbor_position = (father == tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); /* Get left neighbor block number. */ @@ -940,17 +1007,20 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) static void decrement_key(struct cpu_key *key) { - // call item specific function for this key + /* call item specific function for this key */ item_ops[cpu_key_k_type(key)]->decrement_key(key); } -/* Calculate far left/right parent of the left/right neighbor of the current node, that - * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. +/* + * Calculate far left/right parent of the left/right neighbor of the + * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor + * of the parent F[h]. * Calculate left/right common parent of the current node and L[h]/R[h]. * Calculate left/right delimiting key position. - * Returns: PATH_INCORRECT - path in the tree is not correct; - SCHEDULE_OCCURRED - schedule occurred while the function worked; - * CARRY_ON - schedule didn't occur while the function worked; + * Returns: PATH_INCORRECT - path in the tree is not correct + * SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function + * worked */ static int get_far_parent(struct tree_balance *tb, int h, @@ -966,8 +1036,10 @@ static int get_far_parent(struct tree_balance *tb, first_last_position = 0, path_offset = PATH_H_PATH_OFFSET(path, h); - /* Starting from F[h] go upwards in the tree, and look for the common - ancestor of F[h], and its neighbor l/r, that should be obtained. */ + /* + * Starting from F[h] go upwards in the tree, and look for the common + * ancestor of F[h], and its neighbor l/r, that should be obtained. + */ counter = path_offset; @@ -975,21 +1047,33 @@ static int get_far_parent(struct tree_balance *tb, "PAP-8180: invalid path length"); for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { - /* Check whether parent of the current buffer in the path is really parent in the tree. */ + /* + * Check whether parent of the current buffer in the path + * is really parent in the tree. + */ if (!B_IS_IN_TREE (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) return REPEAT_SEARCH; + /* Check whether position in the parent is correct. */ if ((position = PATH_OFFSET_POSITION(path, counter - 1)) > B_NR_ITEMS(parent)) return REPEAT_SEARCH; - /* Check whether parent at the path really points to the child. */ + + /* + * Check whether parent at the path really points + * to the child. + */ if (B_N_CHILD_NUM(parent, position) != PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) return REPEAT_SEARCH; - /* Return delimiting key if position in the parent is not equal to first/last one. */ + + /* + * Return delimiting key if position in the parent is not + * equal to first/last one. + */ if (c_lr_par == RIGHT_PARENTS) first_last_position = B_NR_ITEMS(parent); if (position != first_last_position) { @@ -1002,7 +1086,10 @@ static int get_far_parent(struct tree_balance *tb, /* if we are in the root of the tree, then there is no common father */ if (counter == FIRST_PATH_ELEMENT_OFFSET) { - /* Check whether first buffer in the path is the root of the tree. */ + /* + * Check whether first buffer in the path is the + * root of the tree. + */ if (PATH_OFFSET_PBUFFER (tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == @@ -1031,12 +1118,15 @@ static int get_far_parent(struct tree_balance *tb, } } - /* So, we got common parent of the current node and its left/right neighbor. - Now we are geting the parent of the left/right neighbor. */ + /* + * So, we got common parent of the current node and its + * left/right neighbor. Now we are getting the parent of the + * left/right neighbor. + */ /* Form key to get parent of the left/right neighbor. */ le_key2cpu_key(&s_lr_father_key, - B_N_PDELIM_KEY(*pcom_father, + internal_key(*pcom_father, (c_lr_par == LEFT_PARENTS) ? (tb->lkey[h - 1] = position - @@ -1050,7 +1140,7 @@ static int get_far_parent(struct tree_balance *tb, if (search_by_key (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, h + 1) == IO_ERROR) - // path is released + /* path is released */ return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -1071,12 +1161,15 @@ static int get_far_parent(struct tree_balance *tb, return CARRY_ON; } -/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of - * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset], - * FR[path_offset], CFL[path_offset], CFR[path_offset]. - * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset]. - * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; - * CARRY_ON - schedule didn't occur while the function worked; +/* + * Get parents of neighbors of node in the path(S[path_offset]) and + * common parents of S[path_offset] and L[path_offset]/R[path_offset]: + * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset], + * CFR[path_offset]. + * Calculate numbers of left and right delimiting keys position: + * lkey[path_offset], rkey[path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked + * CARRY_ON - schedule didn't occur while the function worked */ static int get_parents(struct tree_balance *tb, int h) { @@ -1088,8 +1181,11 @@ static int get_parents(struct tree_balance *tb, int h) /* Current node is the root of the tree or will be root of the tree */ if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { - /* The root can not have parents. - Release nodes which previously were obtained as parents of the current node neighbors. */ + /* + * The root can not have parents. + * Release nodes which previously were obtained as + * parents of the current node neighbors. + */ brelse(tb->FL[h]); brelse(tb->CFL[h]); brelse(tb->FR[h]); @@ -1111,10 +1207,14 @@ static int get_parents(struct tree_balance *tb, int h) get_bh(curf); tb->lkey[h] = position - 1; } else { - /* Calculate current parent of L[path_offset], which is the left neighbor of the current node. - Calculate current common parent of L[path_offset] and the current node. Note that - CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset]. - Calculate lkey[path_offset]. */ + /* + * Calculate current parent of L[path_offset], which is the + * left neighbor of the current node. Calculate current + * common parent of L[path_offset] and the current node. + * Note that CFL[path_offset] not equal FL[path_offset] and + * CFL[path_offset] not equal F[path_offset]. + * Calculate lkey[path_offset]. + */ if ((ret = get_far_parent(tb, h + 1, &curf, &curcf, LEFT_PARENTS)) != CARRY_ON) @@ -1130,19 +1230,22 @@ static int get_parents(struct tree_balance *tb, int h) (curcf && !B_IS_IN_TREE(curcf)), "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); -/* Get parent FR[h] of R[h]. */ + /* Get parent FR[h] of R[h]. */ -/* Current node is the last child of F[h]. FR[h] != F[h]. */ + /* Current node is the last child of F[h]. FR[h] != F[h]. */ if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { -/* Calculate current parent of R[h], which is the right neighbor of F[h]. - Calculate current common parent of R[h] and current node. Note that CFR[h] - not equal FR[path_offset] and CFR[h] not equal F[h]. */ + /* + * Calculate current parent of R[h], which is the right + * neighbor of F[h]. Calculate current common parent of + * R[h] and current node. Note that CFR[h] not equal + * FR[path_offset] and CFR[h] not equal F[h]. + */ if ((ret = get_far_parent(tb, h + 1, &curf, &curcf, RIGHT_PARENTS)) != CARRY_ON) return ret; } else { -/* Current node is not the last child of its parent F[h]. */ + /* Current node is not the last child of its parent F[h]. */ curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); get_bh(curf); @@ -1165,8 +1268,10 @@ static int get_parents(struct tree_balance *tb, int h) return CARRY_ON; } -/* it is possible to remove node as result of shiftings to - neighbors even when we insert or paste item. */ +/* + * it is possible to remove node as result of shiftings to + * neighbors even when we insert or paste item. + */ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, struct tree_balance *tb, int h) { @@ -1175,21 +1280,22 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, struct item_head *ih; struct reiserfs_key *r_key = NULL; - ih = B_N_PITEM_HEAD(Sh, 0); + ih = item_head(Sh, 0); if (tb->CFR[h]) - r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]); + r_key = internal_key(tb->CFR[h], tb->rkey[h]); if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes /* shifting may merge items which might save space */ - ((!h - && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0) + && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0) - ((!h && r_key && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + ((h) ? KEY_SIZE : 0)) { /* node can not be removed */ - if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (sfree >= levbytes) { + /* new item fits into node S[h] without any shifting */ if (!h) tb->s0num = B_NR_ITEMS(Sh) + @@ -1202,7 +1308,8 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, return !NO_BALANCING_NEEDED; } -/* Check whether current node S[h] is balanced when increasing its size by +/* + * Check whether current node S[h] is balanced when increasing its size by * Inserting or Pasting. * Calculate parameters for balancing for current level h. * Parameters: @@ -1219,39 +1326,48 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, static int ip_check_balance(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; - int levbytes, /* Number of bytes that must be inserted into (value - is negative if bytes are deleted) buffer which - contains node being balanced. The mnemonic is - that the attempted change in node space used level - is levbytes bytes. */ - ret; + /* + * Number of bytes that must be inserted into (value is negative + * if bytes are deleted) buffer which contains node being balanced. + * The mnemonic is that the attempted change in node space used + * level is levbytes bytes. + */ + int levbytes; + int ret; int lfree, sfree, rfree /* free space in L, S and R */ ; - /* nver is short for number of vertixes, and lnver is the number if - we shift to the left, rnver is the number if we shift to the - right, and lrnver is the number if we shift in both directions. - The goal is to minimize first the number of vertixes, and second, - the number of vertixes whose contents are changed by shifting, - and third the number of uncached vertixes whose contents are - changed by shifting and must be read from disk. */ + /* + * nver is short for number of vertixes, and lnver is the number if + * we shift to the left, rnver is the number if we shift to the + * right, and lrnver is the number if we shift in both directions. + * The goal is to minimize first the number of vertixes, and second, + * the number of vertixes whose contents are changed by shifting, + * and third the number of uncached vertixes whose contents are + * changed by shifting and must be read from disk. + */ int nver, lnver, rnver, lrnver; - /* used at leaf level only, S0 = S[0] is the node being balanced, - sInum [ I = 0,1,2 ] is the number of items that will - remain in node SI after balancing. S1 and S2 are new - nodes that might be created. */ + /* + * used at leaf level only, S0 = S[0] is the node being balanced, + * sInum [ I = 0,1,2 ] is the number of items that will + * remain in node SI after balancing. S1 and S2 are new + * nodes that might be created. + */ - /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. - where 4th parameter is s1bytes and 5th - s2bytes + /* + * we perform 8 calls to get_num_ver(). For each call we + * calculate five parameters. where 4th parameter is s1bytes + * and 5th - s2bytes + * + * s0num, s1num, s2num for 8 cases + * 0,1 - do not shift and do not shift but bottle + * 2 - shift only whole item to left + * 3 - shift to left and bottle as much as possible + * 4,5 - shift to right (whole items and as much as possible + * 6,7 - shift to both directions (whole items and as much as possible) */ - short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases - 0,1 - do not shift and do not shift but bottle - 2 - shift only whole item to left - 3 - shift to left and bottle as much as possible - 4,5 - shift to right (whole items and as much as possible - 6,7 - shift to both directions (whole items and as much as possible) - */ + short snum012[40] = { 0, }; /* Sh is the node whose balance is currently being checked */ struct buffer_head *Sh; @@ -1265,9 +1381,10 @@ static int ip_check_balance(struct tree_balance *tb, int h) reiserfs_panic(tb->tb_sb, "vs-8210", "S[0] can not be 0"); switch (ret = get_empty_nodes(tb, h)) { + /* no balancing for higher levels needed */ case CARRY_ON: set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + return NO_BALANCING_NEEDED; case NO_DISK_SPACE: case REPEAT_SEARCH: @@ -1278,7 +1395,9 @@ static int ip_check_balance(struct tree_balance *tb, int h) } } - if ((ret = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */ + /* get parents of S[h] neighbors. */ + ret = get_parents(tb, h); + if (ret != CARRY_ON) return ret; sfree = B_FREE_SPACE(Sh); @@ -1287,38 +1406,44 @@ static int ip_check_balance(struct tree_balance *tb, int h) rfree = get_rfree(tb, h); lfree = get_lfree(tb, h); + /* and new item fits into node S[h] without any shifting */ if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED) - /* and new item fits into node S[h] without any shifting */ return NO_BALANCING_NEEDED; create_virtual_node(tb, h); /* - determine maximal number of items we can shift to the left neighbor (in tb structure) - and the maximal number of bytes that can flow to the left neighbor - from the left most liquid item that cannot be shifted from S[0] entirely (returned value) + * determine maximal number of items we can shift to the left + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the left neighbor from the left most liquid + * item that cannot be shifted from S[0] entirely (returned value) */ check_left(tb, h, lfree); /* - determine maximal number of items we can shift to the right neighbor (in tb structure) - and the maximal number of bytes that can flow to the right neighbor - from the right most liquid item that cannot be shifted from S[0] entirely (returned value) + * determine maximal number of items we can shift to the right + * neighbor (in tb structure) and the maximal number of bytes + * that can flow to the right neighbor from the right most liquid + * item that cannot be shifted from S[0] entirely (returned value) */ check_right(tb, h, rfree); - /* all contents of internal node S[h] can be moved into its - neighbors, S[h] will be removed after balancing */ + /* + * all contents of internal node S[h] can be moved into its + * neighbors, S[h] will be removed after balancing + */ if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { int to_r; - /* Since we are working on internal nodes, and our internal - nodes have fixed size entries, then we can balance by the - number of items rather than the space they consume. In this - routine we set the left node equal to the right node, - allowing a difference of less than or equal to 1 child - pointer. */ + /* + * Since we are working on internal nodes, and our internal + * nodes have fixed size entries, then we can balance by the + * number of items rather than the space they consume. In this + * routine we set the left node equal to the right node, + * allowing a difference of less than or equal to 1 child + * pointer. + */ to_r = ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - @@ -1328,7 +1453,10 @@ static int ip_check_balance(struct tree_balance *tb, int h) return CARRY_ON; } - /* this checks balance condition, that any two neighboring nodes can not fit in one node */ + /* + * this checks balance condition, that any two neighboring nodes + * can not fit in one node + */ RFALSE(h && (tb->lnum[h] >= vn->vn_nr_item + 1 || tb->rnum[h] >= vn->vn_nr_item + 1), @@ -1337,16 +1465,22 @@ static int ip_check_balance(struct tree_balance *tb, int h) (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), "vs-8225: tree is not balanced on leaf level"); - /* all contents of S[0] can be moved into its neighbors - S[0] will be removed after balancing. */ + /* + * all contents of S[0] can be moved into its neighbors + * S[0] will be removed after balancing. + */ if (!h && is_leaf_removable(tb)) return CARRY_ON; - /* why do we perform this check here rather than earlier?? - Answer: we can win 1 node in some cases above. Moreover we - checked it above, when we checked, that S[0] is not removable - in principle */ - if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + /* + * why do we perform this check here rather than earlier?? + * Answer: we can win 1 node in some cases above. Moreover we + * checked it above, when we checked, that S[0] is not removable + * in principle + */ + + /* new item fits into node S[h] without any shifting */ + if (sfree >= levbytes) { if (!h) tb->s0num = vn->vn_nr_item; set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); @@ -1355,18 +1489,19 @@ static int ip_check_balance(struct tree_balance *tb, int h) { int lpar, rpar, nset, lset, rset, lrset; - /* - * regular overflowing of the node - */ + /* regular overflowing of the node */ - /* get_num_ver works in 2 modes (FLOW & NO_FLOW) - lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) - nset, lset, rset, lrset - shows, whether flowing items give better packing + /* + * get_num_ver works in 2 modes (FLOW & NO_FLOW) + * lpar, rpar - number of items we can shift to left/right + * neighbor (including splitting item) + * nset, lset, rset, lrset - shows, whether flowing items + * give better packing */ #define FLOW 1 #define NO_FLOW 0 /* do not any splitting */ - /* we choose one the following */ + /* we choose one of the following */ #define NOTHING_SHIFT_NO_FLOW 0 #define NOTHING_SHIFT_FLOW 5 #define LEFT_SHIFT_NO_FLOW 10 @@ -1379,10 +1514,13 @@ static int ip_check_balance(struct tree_balance *tb, int h) lpar = tb->lnum[h]; rpar = tb->rnum[h]; - /* calculate number of blocks S[h] must be split into when - nothing is shifted to the neighbors, - as well as number of items in each part of the split node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ + /* + * calculate number of blocks S[h] must be split into when + * nothing is shifted to the neighbors, as well as number of + * items in each part of the split node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any + */ nset = NOTHING_SHIFT_NO_FLOW; nver = get_num_ver(vn->vn_mode, tb, h, 0, -1, h ? vn->vn_nr_item : 0, -1, @@ -1391,7 +1529,10 @@ static int ip_check_balance(struct tree_balance *tb, int h) if (!h) { int nver1; - /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ + /* + * note, that in this case we try to bottle + * between S[0] and S1 (S1 - the first new node) + */ nver1 = get_num_ver(vn->vn_mode, tb, h, 0, -1, 0, -1, snum012 + NOTHING_SHIFT_FLOW, FLOW); @@ -1399,11 +1540,13 @@ static int ip_check_balance(struct tree_balance *tb, int h) nset = NOTHING_SHIFT_FLOW, nver = nver1; } - /* calculate number of blocks S[h] must be split into when - l_shift_num first items and l_shift_bytes of the right most - liquid item to be shifted are shifted to the left neighbor, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any + /* + * calculate number of blocks S[h] must be split into when + * l_shift_num first items and l_shift_bytes of the right + * most liquid item to be shifted are shifted to the left + * neighbor, as well as number of items in each part of the + * splitted node (s012 numbers), and number of bytes + * (s1bytes) of the shared drop which flow to S1 if any */ lset = LEFT_SHIFT_NO_FLOW; lnver = get_num_ver(vn->vn_mode, tb, h, @@ -1422,11 +1565,13 @@ static int ip_check_balance(struct tree_balance *tb, int h) lset = LEFT_SHIFT_FLOW, lnver = lnver1; } - /* calculate number of blocks S[h] must be split into when - r_shift_num first items and r_shift_bytes of the left most - liquid item to be shifted are shifted to the right neighbor, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any + /* + * calculate number of blocks S[h] must be split into when + * r_shift_num first items and r_shift_bytes of the left most + * liquid item to be shifted are shifted to the right neighbor, + * as well as number of items in each part of the splitted + * node (s012 numbers), and number of bytes (s1bytes) of the + * shared drop which flow to S1 if any */ rset = RIGHT_SHIFT_NO_FLOW; rnver = get_num_ver(vn->vn_mode, tb, h, @@ -1451,10 +1596,12 @@ static int ip_check_balance(struct tree_balance *tb, int h) rset = RIGHT_SHIFT_FLOW, rnver = rnver1; } - /* calculate number of blocks S[h] must be split into when - items are shifted in both directions, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any + /* + * calculate number of blocks S[h] must be split into when + * items are shifted in both directions, as well as number + * of items in each part of the splitted node (s012 numbers), + * and number of bytes (s1bytes) of the shared drop which + * flow to S1 if any */ lrset = LR_SHIFT_NO_FLOW; lrnver = get_num_ver(vn->vn_mode, tb, h, @@ -1481,10 +1628,12 @@ static int ip_check_balance(struct tree_balance *tb, int h) lrset = LR_SHIFT_FLOW, lrnver = lrnver1; } - /* Our general shifting strategy is: - 1) to minimized number of new nodes; - 2) to minimized number of neighbors involved in shifting; - 3) to minimized number of disk reads; */ + /* + * Our general shifting strategy is: + * 1) to minimized number of new nodes; + * 2) to minimized number of neighbors involved in shifting; + * 3) to minimized number of disk reads; + */ /* we can win TWO or ONE nodes by shifting in both directions */ if (lrnver < lnver && lrnver < rnver) { @@ -1508,42 +1657,59 @@ static int ip_check_balance(struct tree_balance *tb, int h) return CARRY_ON; } - /* if shifting doesn't lead to better packing then don't shift */ + /* + * if shifting doesn't lead to better packing + * then don't shift + */ if (nver == lrnver) { set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, -1); return CARRY_ON; } - /* now we know that for better packing shifting in only one - direction either to the left or to the right is required */ + /* + * now we know that for better packing shifting in only one + * direction either to the left or to the right is required + */ - /* if shifting to the left is better than shifting to the right */ + /* + * if shifting to the left is better than + * shifting to the right + */ if (lnver < rnver) { SET_PAR_SHIFT_LEFT; return CARRY_ON; } - /* if shifting to the right is better than shifting to the left */ + /* + * if shifting to the right is better than + * shifting to the left + */ if (lnver > rnver) { SET_PAR_SHIFT_RIGHT; return CARRY_ON; } - /* now shifting in either direction gives the same number - of nodes and we can make use of the cached neighbors */ + /* + * now shifting in either direction gives the same number + * of nodes and we can make use of the cached neighbors + */ if (is_left_neighbor_in_cache(tb, h)) { SET_PAR_SHIFT_LEFT; return CARRY_ON; } - /* shift to the right independently on whether the right neighbor in cache or not */ + /* + * shift to the right independently on whether the + * right neighbor in cache or not + */ SET_PAR_SHIFT_RIGHT; return CARRY_ON; } } -/* Check whether current node S[h] is balanced when Decreasing its size by +/* + * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting for INTERNAL node of S+tree. * Calculate parameters for balancing for current level h. * Parameters: @@ -1563,8 +1729,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; - /* Sh is the node whose balance is currently being checked, - and Fh is its father. */ + /* + * Sh is the node whose balance is currently being checked, + * and Fh is its father. + */ struct buffer_head *Sh, *Fh; int maxsize, ret; int lfree, rfree /* free space in L and R */ ; @@ -1574,19 +1742,25 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) maxsize = MAX_CHILD_SIZE(Sh); -/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ -/* new_nr_item = number of items node would have if operation is */ -/* performed without balancing (new_nr_item); */ + /* + * using tb->insert_size[h], which is negative in this case, + * create_virtual_node calculates: + * new_nr_item = number of items node would have if operation is + * performed without balancing (new_nr_item); + */ create_virtual_node(tb, h); if (!Fh) { /* S[h] is the root. */ + /* no balancing for higher levels needed */ if (vn->vn_nr_item > 0) { set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + return NO_BALANCING_NEEDED; } - /* new_nr_item == 0. + /* + * new_nr_item == 0. * Current root will be deleted resulting in - * decrementing the tree height. */ + * decrementing the tree height. + */ set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); return CARRY_ON; } @@ -1602,12 +1776,18 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) check_left(tb, h, lfree); check_right(tb, h, rfree); - if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid. - * In this case we balance only if it leads to better packing. */ - if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors, - * which is impossible with greater values of new_nr_item. */ + /* + * Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. + */ + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { + /* + * Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. + */ + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { + /* All contents of S[h] can be moved to L[h]. */ if (tb->lnum[h] >= vn->vn_nr_item + 1) { - /* All contents of S[h] can be moved to L[h]. */ int n; int order_L; @@ -1623,8 +1803,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) return CARRY_ON; } + /* All contents of S[h] can be moved to R[h]. */ if (tb->rnum[h] >= vn->vn_nr_item + 1) { - /* All contents of S[h] can be moved to R[h]. */ int n; int order_R; @@ -1641,8 +1821,11 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) } } + /* + * All contents of S[h] can be moved to the neighbors + * (L[h] & R[h]). + */ if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { - /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ int to_r; to_r = @@ -1659,7 +1842,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) return NO_BALANCING_NEEDED; } - /* Current node contain insufficient number of items. Balancing is required. */ + /* + * Current node contain insufficient number of items. + * Balancing is required. + */ /* Check whether we can merge S[h] with left neighbor. */ if (tb->lnum[h] >= vn->vn_nr_item + 1) if (is_left_neighbor_in_cache(tb, h) @@ -1726,7 +1912,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) return CARRY_ON; } -/* Check whether current node S[h] is balanced when Decreasing its size by +/* + * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Truncating for LEAF node of S+tree. * Calculate parameters for balancing for current level h. * Parameters: @@ -1743,15 +1930,21 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; - /* Number of bytes that must be deleted from - (value is negative if bytes are deleted) buffer which - contains node being balanced. The mnemonic is that the - attempted change in node space used level is levbytes bytes. */ + /* + * Number of bytes that must be deleted from + * (value is negative if bytes are deleted) buffer which + * contains node being balanced. The mnemonic is that the + * attempted change in node space used level is levbytes bytes. + */ int levbytes; + /* the maximal item size */ int maxsize, ret; - /* S0 is the node whose balance is currently being checked, - and F0 is its father. */ + + /* + * S0 is the node whose balance is currently being checked, + * and F0 is its father. + */ struct buffer_head *S0, *F0; int lfree, rfree /* free space in L and R */ ; @@ -1784,9 +1977,11 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) if (are_leaves_removable(tb, lfree, rfree)) return CARRY_ON; - /* determine maximal number of items we can shift to the left/right neighbor - and the maximal number of bytes that can flow to the left/right neighbor - from the left/right most liquid item that cannot be shifted from S[0] entirely + /* + * determine maximal number of items we can shift to the left/right + * neighbor and the maximal number of bytes that can flow to the + * left/right neighbor from the left/right most liquid item that + * cannot be shifted from S[0] entirely */ check_left(tb, h, lfree); check_right(tb, h, rfree); @@ -1810,7 +2005,10 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) return CARRY_ON; } - /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ + /* + * All contents of S[0] can be moved to the neighbors (L[0] & R[0]). + * Set parameters and return + */ if (is_leaf_removable(tb)) return CARRY_ON; @@ -1820,7 +2018,8 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h) return NO_BALANCING_NEEDED; } -/* Check whether current node S[h] is balanced when Decreasing its size by +/* + * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting. * Calculate parameters for balancing for current level h. * Parameters: @@ -1844,15 +2043,16 @@ static int dc_check_balance(struct tree_balance *tb, int h) return dc_check_balance_leaf(tb, h); } -/* Check whether current node S[h] is balanced. +/* + * Check whether current node S[h] is balanced. * Calculate parameters for balancing for current level h. * Parameters: * * tb tree_balance structure: * - * tb is a large structure that must be read about in the header file - * at the same time as this procedure if the reader is to successfully - * understand this procedure + * tb is a large structure that must be read about in the header + * file at the same time as this procedure if the reader is + * to successfully understand this procedure * * h current level of the node; * inum item number in S[h]; @@ -1882,8 +2082,8 @@ static int check_balance(int mode, RFALSE(mode == M_INSERT && !vn->vn_ins_ih, "vs-8255: ins_ih can not be 0 in insert mode"); + /* Calculate balance parameters when size of node is increasing. */ if (tb->insert_size[h] > 0) - /* Calculate balance parameters when size of node is increasing. */ return ip_check_balance(tb, h); /* Calculate balance parameters when size of node is decreasing. */ @@ -1911,21 +2111,23 @@ static int get_direct_parent(struct tree_balance *tb, int h) PATH_OFFSET_POSITION(path, path_offset - 1) = 0; return CARRY_ON; } - return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ + /* Root is changed and we must recalculate the path. */ + return REPEAT_SEARCH; } + /* Parent in the path is not in the tree. */ if (!B_IS_IN_TREE (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1))) - return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ + return REPEAT_SEARCH; if ((position = PATH_OFFSET_POSITION(path, path_offset - 1)) > B_NR_ITEMS(bh)) return REPEAT_SEARCH; + /* Parent in the path is not parent of the current node in the tree. */ if (B_N_CHILD_NUM(bh, position) != PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr) - /* Parent in the path is not parent of the current node in the tree. */ return REPEAT_SEARCH; if (buffer_locked(bh)) { @@ -1936,10 +2138,15 @@ static int get_direct_parent(struct tree_balance *tb, int h) return REPEAT_SEARCH; } - return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ + /* + * Parent in the path is unlocked and really parent + * of the current node. + */ + return CARRY_ON; } -/* Using lnum[h] and rnum[h] we should determine what neighbors +/* + * Using lnum[h] and rnum[h] we should determine what neighbors * of S[h] we * need in order to balance S[h], and get them if necessary. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; @@ -1997,7 +2204,7 @@ static int get_neighbors(struct tree_balance *tb, int h) } /* We need right neighbor to balance S[path_offset]. */ - if (tb->rnum[h]) { /* We need right neighbor to balance S[path_offset]. */ + if (tb->rnum[h]) { PROC_INFO_INC(sb, need_r_neighbor[h]); bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); @@ -2053,9 +2260,11 @@ static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) (max_num_of_entries - 1) * sizeof(__u16)); } -/* maybe we should fail balancing we are going to perform when kmalloc - fails several times. But now it will loop until kmalloc gets - required memory */ +/* + * maybe we should fail balancing we are going to perform when kmalloc + * fails several times. But now it will loop until kmalloc gets + * required memory + */ static int get_mem_for_virtual_node(struct tree_balance *tb) { int check_fs = 0; @@ -2064,8 +2273,8 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); + /* we have to allocate more memory for virtual node */ if (size > tb->vn_buf_size) { - /* we have to allocate more memory for virtual node */ if (tb->vn_buf) { /* free memory allocated before */ kfree(tb->vn_buf); @@ -2079,10 +2288,12 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) /* get memory for virtual item */ buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); if (!buf) { - /* getting memory with GFP_KERNEL priority may involve - balancing now (due to indirect_to_direct conversion on - dcache shrinking). So, release path and collected - resources here */ + /* + * getting memory with GFP_KERNEL priority may involve + * balancing now (due to indirect_to_direct conversion + * on dcache shrinking). So, release path and collected + * resources here + */ free_buffers_in_tb(tb); buf = kmalloc(size, GFP_NOFS); if (!buf) { @@ -2168,8 +2379,10 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) for (i = tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { - /* if I understand correctly, we can only be sure the last buffer - ** in the path is in the tree --clm + /* + * if I understand correctly, we can only + * be sure the last buffer in the path is + * in the tree --clm */ #ifdef CONFIG_REISERFS_CHECK if (PATH_PLAST_BUFFER(tb->tb_path) == @@ -2256,13 +2469,15 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) } } } - /* as far as I can tell, this is not required. The FEB list seems - ** to be full of newly allocated nodes, which will never be locked, - ** dirty, or anything else. - ** To be safe, I'm putting in the checks and waits in. For the moment, - ** they are needed to keep the code in journal.c from complaining - ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. - ** --clm + + /* + * as far as I can tell, this is not required. The FEB list + * seems to be full of newly allocated nodes, which will + * never be locked, dirty, or anything else. + * To be safe, I'm putting in the checks and waits in. + * For the moment, they are needed to keep the code in + * journal.c from complaining about the buffer. + * That code is inside CONFIG_REISERFS_CHECK as well. --clm */ for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { if (tb->FEB[i]) { @@ -2300,7 +2515,8 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) return CARRY_ON; } -/* Prepare for balancing, that is +/* + * Prepare for balancing, that is * get all necessary parents, and neighbors; * analyze what and where should be moved; * get sufficient number of new nodes; @@ -2309,13 +2525,14 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) * When ported to SMP kernels, only at the last moment after all needed nodes * are collected in cache, will the resources be locked using the usual * textbook ordered lock acquisition algorithms. Note that ensuring that - * this code neither write locks what it does not need to write lock nor locks out of order - * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans + * this code neither write locks what it does not need to write lock nor locks + * out of order will be a pain in the butt that could have been avoided. + * Grumble grumble. -Hans * * fix is meant in the sense of render unchanging * - * Latency might be improved by first gathering a list of what buffers are needed - * and then getting as many of them in parallel as possible? -Hans + * Latency might be improved by first gathering a list of what buffers + * are needed and then getting as many of them in parallel as possible? -Hans * * Parameters: * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) @@ -2335,8 +2552,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb, int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); int pos_in_item; - /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared - ** during wait_tb_buffers_run + /* + * we set wait_tb_buffers_run when we have to restore any dirty + * bits cleared during wait_tb_buffers_run */ int wait_tb_buffers_run = 0; struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); @@ -2347,14 +2565,15 @@ int fix_nodes(int op_mode, struct tree_balance *tb, tb->fs_gen = get_generation(tb->tb_sb); - /* we prepare and log the super here so it will already be in the - ** transaction when do_balance needs to change it. - ** This way do_balance won't have to schedule when trying to prepare - ** the super for logging + /* + * we prepare and log the super here so it will already be in the + * transaction when do_balance needs to change it. + * This way do_balance won't have to schedule when trying to prepare + * the super for logging */ reiserfs_prepare_for_journal(tb->tb_sb, SB_BUFFER_WITH_SB(tb->tb_sb), 1); - journal_mark_dirty(tb->transaction_handle, tb->tb_sb, + journal_mark_dirty(tb->transaction_handle, SB_BUFFER_WITH_SB(tb->tb_sb)); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; @@ -2408,7 +2627,7 @@ int fix_nodes(int op_mode, struct tree_balance *tb, #endif if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) - // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat + /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */ return REPEAT_SEARCH; /* Starting from the leaf level; for all levels h of the tree. */ @@ -2427,7 +2646,10 @@ int fix_nodes(int op_mode, struct tree_balance *tb, goto repeat; if (h != MAX_HEIGHT - 1) tb->insert_size[h + 1] = 0; - /* ok, analysis and resource gathering are complete */ + /* + * ok, analysis and resource gathering + * are complete + */ break; } goto repeat; @@ -2437,15 +2659,19 @@ int fix_nodes(int op_mode, struct tree_balance *tb, if (ret != CARRY_ON) goto repeat; - /* No disk space, or schedule occurred and analysis may be - * invalid and needs to be redone. */ + /* + * No disk space, or schedule occurred and analysis may be + * invalid and needs to be redone. + */ ret = get_empty_nodes(tb, h); if (ret != CARRY_ON) goto repeat; + /* + * We have a positive insert size but no nodes exist on this + * level, this means that we are creating a new root. + */ if (!PATH_H_PBUFFER(tb->tb_path, h)) { - /* We have a positive insert size but no nodes exist on this - level, this means that we are creating a new root. */ RFALSE(tb->blknum[h] != 1, "PAP-8350: creating new empty root"); @@ -2453,11 +2679,13 @@ int fix_nodes(int op_mode, struct tree_balance *tb, if (h < MAX_HEIGHT - 1) tb->insert_size[h + 1] = 0; } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { + /* + * The tree needs to be grown, so this node S[h] + * which is the root node is split into two nodes, + * and a new node (S[h+1]) will be created to + * become the root node. + */ if (tb->blknum[h] > 1) { - /* The tree needs to be grown, so this node S[h] - which is the root node is split into two nodes, - and a new node (S[h+1]) will be created to - become the root node. */ RFALSE(h == MAX_HEIGHT - 1, "PAP-8355: attempt to create too high of a tree"); @@ -2487,12 +2715,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb, goto repeat; } - repeat: - // fix_nodes was unable to perform its calculation due to - // filesystem got changed under us, lack of free disk space or i/o - // failure. If the first is the case - the search will be - // repeated. For now - free all resources acquired so far except - // for the new allocated nodes +repeat: + /* + * fix_nodes was unable to perform its calculation due to + * filesystem got changed under us, lack of free disk space or i/o + * failure. If the first is the case - the search will be + * repeated. For now - free all resources acquired so far except + * for the new allocated nodes + */ { int i; @@ -2548,8 +2778,6 @@ int fix_nodes(int op_mode, struct tree_balance *tb, } -/* Anatoly will probably forgive me renaming tb to tb. I just - wanted to make lines shorter */ void unfix_nodes(struct tree_balance *tb) { int i; @@ -2578,8 +2806,10 @@ void unfix_nodes(struct tree_balance *tb) for (i = 0; i < MAX_FEB_SIZE; i++) { if (tb->FEB[i]) { b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; - /* de-allocated block which was not used by balancing and - bforget about buffer for it */ + /* + * de-allocated block which was not used by + * balancing and bforget about buffer for it + */ brelse(tb->FEB[i]); reiserfs_free_block(tb->transaction_handle, NULL, blocknr, 0); diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index 91b0cc1242a2..7a26c4fe6c46 100644 --- a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -12,12 +12,6 @@ * Yura's function is added (04/07/2000) */ -// -// keyed_hash -// yura_hash -// r5_hash -// - #include <linux/kernel.h> #include "reiserfs.h" #include <asm/types.h> @@ -56,7 +50,7 @@ u32 keyed_hash(const signed char *msg, int len) u32 pad; int i; - // assert(len >= 0 && len < 256); + /* assert(len >= 0 && len < 256); */ pad = (u32) len | ((u32) len << 8); pad |= pad << 16; @@ -127,9 +121,10 @@ u32 keyed_hash(const signed char *msg, int len) return h0 ^ h1; } -/* What follows in this file is copyright 2000 by Hans Reiser, and the - * licensing of what follows is governed by reiserfs/README */ - +/* + * What follows in this file is copyright 2000 by Hans Reiser, and the + * licensing of what follows is governed by reiserfs/README + */ u32 yura_hash(const signed char *msg, int len) { int j, pow; diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index e1978fd895f5..73231b1ebdbe 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -12,7 +12,10 @@ int balance_internal(struct tree_balance *, int, int, struct item_head *, struct buffer_head **); -/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */ +/* + * modes of internal_shift_left, internal_shift_right and + * internal_insert_childs + */ #define INTERNAL_SHIFT_FROM_S_TO_L 0 #define INTERNAL_SHIFT_FROM_R_TO_S 1 #define INTERNAL_SHIFT_FROM_L_TO_S 2 @@ -32,7 +35,9 @@ static void internal_define_dest_src_infos(int shift_mode, memset(src_bi, 0, sizeof(struct buffer_info)); /* define dest, src, dest parent, dest position */ switch (shift_mode) { - case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ + + /* used in internal_shift_left */ + case INTERNAL_SHIFT_FROM_S_TO_L: src_bi->tb = tb; src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); @@ -52,12 +57,14 @@ static void internal_define_dest_src_infos(int shift_mode, dest_bi->tb = tb; dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); - dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ + /* dest position is analog of dest->b_item_order */ + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); *d_key = tb->lkey[h]; *cf = tb->CFL[h]; break; - case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ + /* used in internal_shift_left */ + case INTERNAL_SHIFT_FROM_R_TO_S: src_bi->tb = tb; src_bi->bi_bh = tb->R[h]; src_bi->bi_parent = tb->FR[h]; @@ -111,7 +118,8 @@ static void internal_define_dest_src_infos(int shift_mode, } } -/* Insert count node pointers into buffer cur before position to + 1. +/* + * Insert count node pointers into buffer cur before position to + 1. * Insert count items into buffer cur before position to. * Items and node pointers are specified by inserted and bh respectively. */ @@ -146,14 +154,14 @@ static void internal_insert_childs(struct buffer_info *cur_bi, /* copy to_be_insert disk children */ for (i = 0; i < count; i++) { - put_dc_size(&(new_dc[i]), + put_dc_size(&new_dc[i], MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); - put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr); + put_dc_block_number(&new_dc[i], bh[i]->b_blocknr); } memcpy(dc, new_dc, DC_SIZE * count); /* prepare space for count items */ - ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 0 : to)); + ih = internal_key(cur, ((to == -1) ? 0 : to)); memmove(ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); @@ -190,8 +198,10 @@ static void internal_insert_childs(struct buffer_info *cur_bi, } -/* Delete del_num items and node pointers from buffer cur starting from * - * the first_i'th item and first_p'th pointers respectively. */ +/* + * Delete del_num items and node pointers from buffer cur starting from + * the first_i'th item and first_p'th pointers respectively. + */ static void internal_delete_pointers_items(struct buffer_info *cur_bi, int first_p, int first_i, int del_num) @@ -233,7 +243,7 @@ static void internal_delete_pointers_items(struct buffer_info *cur_bi, dc = B_N_CHILD(cur, first_p); memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); - key = B_N_PDELIM_KEY(cur, first_i); + key = internal_key(cur, first_i); memmove(key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE); @@ -270,22 +280,30 @@ static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) i_from = (from == 0) ? from : from - 1; - /* delete n pointers starting from `from' position in CUR; - delete n keys starting from 'i_from' position in CUR; + /* + * delete n pointers starting from `from' position in CUR; + * delete n keys starting from 'i_from' position in CUR; */ internal_delete_pointers_items(cur_bi, from, i_from, n); } -/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest -* last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest - * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest +/* + * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer + * dest + * last_first == FIRST_TO_LAST means that we copy first items + * from src to tail of dest + * last_first == LAST_TO_FIRST means that we copy last items + * from src to head of dest */ static void internal_copy_pointers_items(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int cpy_num) { - /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST * - * as delimiting key have already inserted to buffer dest.*/ + /* + * ATTENTION! Number of node pointers in DEST is equal to number + * of items in DEST as delimiting key have already inserted to + * buffer dest. + */ struct buffer_head *dest = dest_bi->bi_bh; int nr_dest, nr_src; int dest_order, src_order; @@ -330,13 +348,13 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi, memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num); /* prepare space for cpy_num - 1 item headers */ - key = B_N_PDELIM_KEY(dest, dest_order); + key = internal_key(dest, dest_order); memmove(key + cpy_num - 1, key, KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num)); /* insert headers */ - memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1)); + memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1)); /* sizes, item number */ set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); @@ -366,7 +384,9 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi, } -/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest. +/* + * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to + * buffer dest. * Delete cpy_num - del_par items and node pointers from buffer src. * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. @@ -385,8 +405,10 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi, if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ first_pointer = 0; first_item = 0; - /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, - for key - with first_item */ + /* + * delete cpy_num - del_par pointers and keys starting for + * pointers with first_pointer, for key - with first_item + */ internal_delete_pointers_items(src_bi, first_pointer, first_item, cpy_num - del_par); } else { /* shift_right occurs */ @@ -404,7 +426,9 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi, } /* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ -static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before, /* insert key before key with n_dest number */ +static void internal_insert_key(struct buffer_info *dest_bi, + /* insert key before key with n_dest number */ + int dest_position_before, struct buffer_head *src, int src_position) { struct buffer_head *dest = dest_bi->bi_bh; @@ -429,12 +453,12 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b nr = blkh_nr_item(blkh); /* prepare space for inserting key */ - key = B_N_PDELIM_KEY(dest, dest_position_before); + key = internal_key(dest, dest_position_before); memmove(key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); /* insert key */ - memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); + memcpy(key, internal_key(src, src_position), KEY_SIZE); /* Change dirt, free space, item number fields. */ @@ -453,13 +477,19 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b } } -/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. - * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. +/* + * Insert d_key'th (delimiting) key from buffer cfl to tail of dest. + * Copy pointer_amount node pointers and pointer_amount - 1 items from + * buffer src to buffer dest. * Replace d_key'th key in buffer cfl. * Delete pointer_amount items and node pointers from buffer src. */ /* this can be invoked both to shift from S to L and from R to S */ -static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ +static void internal_shift_left( + /* + * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S + */ + int mode, struct tree_balance *tb, int h, int pointer_amount) { @@ -473,7 +503,10 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO /*printk("pointer_amount = %d\n",pointer_amount); */ if (pointer_amount) { - /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ + /* + * insert delimiting key from common father of dest and + * src to node dest into position B_NR_ITEM(dest) + */ internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); @@ -492,7 +525,8 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO } -/* Insert delimiting key to L[h]. +/* + * Insert delimiting key to L[h]. * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. * Delete n - 1 items and node pointers from buffer S[h]. */ @@ -507,23 +541,27 @@ static void internal_shift1_left(struct tree_balance *tb, internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); - if (pointer_amount > 0) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + if (pointer_amount > 0) internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); - /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */ /* last parameter is del_parameter */ internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1); - /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */ } -/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. +/* + * Insert d_key'th (delimiting) key from buffer cfr to head of dest. * Copy n node pointers and n - 1 items from buffer src to buffer dest. * Replace d_key'th key in buffer cfr. * Delete n items and node pointers from buffer src. */ -static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ +static void internal_shift_right( + /* + * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S + */ + int mode, struct tree_balance *tb, int h, int pointer_amount) { @@ -538,7 +576,10 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR nr = B_NR_ITEMS(src_bi.bi_bh); if (pointer_amount > 0) { - /* insert delimiting key from common father of dest and src to dest node into position 0 */ + /* + * insert delimiting key from common father of dest + * and src to dest node into position 0 + */ internal_insert_key(&dest_bi, 0, cf, d_key_position); if (nr == pointer_amount - 1) { RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || @@ -559,7 +600,8 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR pointer_amount, 0); } -/* Insert delimiting key to R[h]. +/* + * Insert delimiting key to R[h]. * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. * Delete n - 1 items and node pointers from buffer S[h]. */ @@ -574,18 +616,19 @@ static void internal_shift1_right(struct tree_balance *tb, internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); - if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ + /* insert rkey from CFR[h] to right neighbor R[h] */ + if (pointer_amount > 0) internal_insert_key(&dest_bi, 0, cf, d_key_position); - /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */ /* last parameter is del_parameter */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1); - /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */ } -/* Delete insert_num node pointers together with their left items - * and balance current node.*/ +/* + * Delete insert_num node pointers together with their left items + * and balance current node. + */ static void balance_internal_when_delete(struct tree_balance *tb, int h, int child_pos) { @@ -626,9 +669,11 @@ static void balance_internal_when_delete(struct tree_balance *tb, new_root = tb->R[h - 1]; else new_root = tb->L[h - 1]; - /* switch super block's tree root block number to the new value */ + /* + * switch super block's tree root block + * number to the new value */ PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); - //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; + /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */ PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1); @@ -636,8 +681,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); /*&&&&&&&&&&&&&&&&&&&&&& */ + /* use check_internal if new root is an internal node */ if (h > 1) - /* use check_internal if new root is an internal node */ check_internal(new_root); /*&&&&&&&&&&&&&&&&&&&&&& */ @@ -648,7 +693,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { /* join S[h] with L[h] */ + /* join S[h] with L[h] */ + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { RFALSE(tb->rnum[h] != 0, "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", @@ -660,7 +706,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { /* join S[h] with R[h] */ + /* join S[h] with R[h] */ + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { RFALSE(tb->lnum[h] != 0, "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", h, tb->lnum[h]); @@ -671,17 +718,18 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->lnum[h] < 0) { /* borrow from left neighbor L[h] */ + /* borrow from left neighbor L[h] */ + if (tb->lnum[h] < 0) { RFALSE(tb->rnum[h] != 0, "wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]); - /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */ internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]); return; } - if (tb->rnum[h] < 0) { /* borrow from right neighbor R[h] */ + /* borrow from right neighbor R[h] */ + if (tb->rnum[h] < 0) { RFALSE(tb->lnum[h] != 0, "invalid tb->lnum[%d]==%d when borrow from R[h]", h, tb->lnum[h]); @@ -689,7 +737,8 @@ static void balance_internal_when_delete(struct tree_balance *tb, return; } - if (tb->lnum[h] > 0) { /* split S[h] into two parts and put them into neighbors */ + /* split S[h] into two parts and put them into neighbors */ + if (tb->lnum[h] > 0) { RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", h, tb->lnum[h], h, tb->rnum[h], n); @@ -717,7 +766,7 @@ static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) return; - memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); + memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); } @@ -732,34 +781,41 @@ static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) "R[h] can not be empty if it exists (item number=%d)", B_NR_ITEMS(tb->R[h])); - memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); + memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); } -int balance_internal(struct tree_balance *tb, /* tree_balance structure */ - int h, /* level of the tree */ - int child_pos, struct item_head *insert_key, /* key for insertion on higher level */ - struct buffer_head **insert_ptr /* node for insertion on higher level */ - ) - /* if inserting/pasting - { - child_pos is the position of the node-pointer in S[h] that * - pointed to S[h-1] before balancing of the h-1 level; * - this means that new pointers and items must be inserted AFTER * - child_pos - } - else - { - it is the position of the leftmost pointer that must be deleted (together with - its corresponding key to the left of the pointer) - as a result of the previous level's balancing. - } - */ + +/* + * if inserting/pasting { + * child_pos is the position of the node-pointer in S[h] that + * pointed to S[h-1] before balancing of the h-1 level; + * this means that new pointers and items must be inserted AFTER + * child_pos + * } else { + * it is the position of the leftmost pointer that must be deleted + * (together with its corresponding key to the left of the pointer) + * as a result of the previous level's balancing. + * } + */ + +int balance_internal(struct tree_balance *tb, + int h, /* level of the tree */ + int child_pos, + /* key for insertion on higher level */ + struct item_head *insert_key, + /* node for insertion on higher level */ + struct buffer_head **insert_ptr) { struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); struct buffer_info bi; - int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ + + /* + * we return this: it is 0 if there is no S[h], + * else it is tb->S[h]->b_item_order + */ + int order; int insert_num, n, k; struct buffer_head *S_new; struct item_head new_insert_key; @@ -774,8 +830,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure (tbSh) ? PATH_H_POSITION(tb->tb_path, h + 1) /*tb->S[h]->b_item_order */ : 0; - /* Using insert_size[h] calculate the number insert_num of items - that must be inserted to or deleted from S[h]. */ + /* + * Using insert_size[h] calculate the number insert_num of items + * that must be inserted to or deleted from S[h]. + */ insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); /* Check whether insert_num is proper * */ @@ -794,23 +852,21 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure k = 0; if (tb->lnum[h] > 0) { - /* shift lnum[h] items from S[h] to the left neighbor L[h]. - check how many of new items fall into L[h] or CFL[h] after - shifting */ + /* + * shift lnum[h] items from S[h] to the left neighbor L[h]. + * check how many of new items fall into L[h] or CFL[h] after + * shifting + */ n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ if (tb->lnum[h] <= child_pos) { /* new items don't fall into L[h] or CFL[h] */ internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); - /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */ child_pos -= tb->lnum[h]; } else if (tb->lnum[h] > child_pos + insert_num) { /* all new items fall into L[h] */ internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num); - /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, - tb->lnum[h]-insert_num); - */ /* insert insert_num keys and node-pointers into L[h] */ bi.tb = tb; bi.bi_bh = tb->L[h]; @@ -826,7 +882,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure } else { struct disk_child *dc; - /* some items fall into L[h] or CFL[h], but some don't fall */ + /* + * some items fall into L[h] or CFL[h], + * but some don't fall + */ internal_shift1_left(tb, h, child_pos + 1); /* calculate number of new items that fall into L[h] */ k = tb->lnum[h] - child_pos - 1; @@ -841,7 +900,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure replace_lkey(tb, h, insert_key + k); - /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ + /* + * replace the first node-ptr in S[h] by + * node-ptr to insert_ptr[k] + */ dc = B_N_CHILD(tbSh, 0); put_dc_size(dc, MAX_CHILD_SIZE(insert_ptr[k]) - @@ -860,17 +922,17 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure /* tb->lnum[h] > 0 */ if (tb->rnum[h] > 0) { /*shift rnum[h] items from S[h] to the right neighbor R[h] */ - /* check how many of new items fall into R or CFR after shifting */ + /* + * check how many of new items fall into R or CFR + * after shifting + */ n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ if (n - tb->rnum[h] >= child_pos) /* new items fall into S[h] */ - /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */ internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); else if (n + insert_num - tb->rnum[h] < child_pos) { /* all new items fall into R[h] */ - /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], - tb->rnum[h] - insert_num); */ internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num); @@ -904,7 +966,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure replace_rkey(tb, h, insert_key + insert_num - k - 1); - /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */ + /* + * replace the first node-ptr in R[h] by + * node-ptr insert_ptr[insert_num-k-1] + */ dc = B_N_CHILD(tb->R[h], 0); put_dc_size(dc, MAX_CHILD_SIZE(insert_ptr @@ -921,7 +986,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure } } - /** Fill new node that appears instead of S[h] **/ + /** Fill new node that appears instead of S[h] **/ RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); @@ -997,26 +1062,30 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure /* new items don't fall into S_new */ /* store the delimiting key for the next level */ /* new_insert_key = (n - snum)'th key in S[h] */ - memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum), + memcpy(&new_insert_key, internal_key(tbSh, n - snum), KEY_SIZE); /* last parameter is del_par */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0); - /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */ } else if (n + insert_num - snum < child_pos) { /* all new items fall into S_new */ /* store the delimiting key for the next level */ - /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ + /* + * new_insert_key = (n + insert_item - snum)'th + * key in S[h] + */ memcpy(&new_insert_key, - B_N_PDELIM_KEY(tbSh, n + insert_num - snum), + internal_key(tbSh, n + insert_num - snum), KEY_SIZE); /* last parameter is del_par */ internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0); - /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */ - /* insert insert_num keys and node-pointers into S_new */ + /* + * insert insert_num keys and node-pointers + * into S_new + */ internal_insert_childs(&dest_bi, /*S_new,tb->S[h-1]->b_next, */ child_pos - n - insert_num + @@ -1033,7 +1102,6 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1); - /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */ /* calculate number of new items that fall into S_new */ k = snum - n + child_pos - 1; @@ -1043,7 +1111,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure /* new_insert_key = insert_key[insert_num - k - 1] */ memcpy(&new_insert_key, insert_key + insert_num - k - 1, KEY_SIZE); - /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ + /* + * replace first node-ptr in S_new by node-ptr + * to insert_ptr[insert_num-k-1] + */ dc = B_N_CHILD(S_new, 0); put_dc_size(dc, @@ -1066,7 +1137,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", S_new); - // S_new is released in unfix_nodes + /* S_new is released in unfix_nodes */ } n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index bc8b8009897d..63b2b0ec49e6 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -25,7 +25,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, void reiserfs_evict_inode(struct inode *inode) { - /* We need blocks for transaction + (user+group) quota update (possibly delete) */ + /* + * We need blocks for transaction + (user+group) quota + * update (possibly delete) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); @@ -39,8 +42,12 @@ void reiserfs_evict_inode(struct inode *inode) if (inode->i_nlink) goto no_delete; - /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ - if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + /* + * The = 0 happens when we abort creating a new inode + * for some reason like lack of space.. + * also handles bad_inode case + */ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { reiserfs_delete_xattrs(inode); @@ -54,34 +61,43 @@ void reiserfs_evict_inode(struct inode *inode) err = reiserfs_delete_object(&th, inode); - /* Do quota update inside a transaction for journaled quotas. We must do that - * after delete_object so that quota updates go into the same transaction as - * stat data deletion */ + /* + * Do quota update inside a transaction for journaled quotas. + * We must do that after delete_object so that quota updates + * go into the same transaction as stat data deletion + */ if (!err) { int depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_free_inode(inode); reiserfs_write_lock_nested(inode->i_sb, depth); } - if (journal_end(&th, inode->i_sb, jbegin_count)) + if (journal_end(&th)) goto out; - /* check return value from reiserfs_delete_object after + /* + * check return value from reiserfs_delete_object after * ending the transaction */ if (err) goto out; - /* all items of file are deleted, so we can remove "save" link */ - remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything - * about an error here */ + /* + * all items of file are deleted, so we can remove + * "save" link + * we can't do anything about an error here + */ + remove_save_link(inode, 0 /* not truncate */); out: reiserfs_write_unlock(inode->i_sb); } else { /* no object items are in the tree */ ; } - clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ + + /* note this must go after the journal_end to prevent deadlock */ + clear_inode(inode); + dquot_drop(inode); inode->i_blocks = 0; return; @@ -103,8 +119,10 @@ static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, key->key_length = length; } -/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set - offset and type of key */ +/* + * take base of inode_key (it comes from inode always) (dirid, objectid) + * and version from an inode, set offset and type of key + */ void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, int type, int length) { @@ -114,9 +132,7 @@ void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, length); } -// -// when key is 0, do not set version and short key -// +/* when key is 0, do not set version and short key */ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, int version, loff_t offset, int type, int length, @@ -132,43 +148,47 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, set_le_ih_k_type(ih, type); put_ih_item_len(ih, length); /* set_ih_free_space (ih, 0); */ - // for directory items it is entry count, for directs and stat - // datas - 0xffff, for indirects - 0 + /* + * for directory items it is entry count, for directs and stat + * datas - 0xffff, for indirects - 0 + */ put_ih_entry_count(ih, entry_count); } -// -// FIXME: we might cache recently accessed indirect item - -// Ugh. Not too eager for that.... -// I cut the code until such time as I see a convincing argument (benchmark). -// I don't want a bloated inode struct..., and I don't like code complexity.... - -/* cutting the code is fine, since it really isn't in use yet and is easy -** to add back in. But, Vladimir has a really good idea here. Think -** about what happens for reading a file. For each page, -** The VFS layer calls reiserfs_readpage, who searches the tree to find -** an indirect item. This indirect item has X number of pointers, where -** X is a big number if we've done the block allocation right. But, -** we only use one or two of these pointers during each call to readpage, -** needlessly researching again later on. -** -** The size of the cache could be dynamic based on the size of the file. -** -** I'd also like to see us cache the location the stat data item, since -** we are needlessly researching for that frequently. -** -** --chris -*/ +/* + * FIXME: we might cache recently accessed indirect item + * Ugh. Not too eager for that.... + * I cut the code until such time as I see a convincing argument (benchmark). + * I don't want a bloated inode struct..., and I don't like code complexity.... + */ -/* If this page has a file tail in it, and -** it was read in by get_block_create_0, the page data is valid, -** but tail is still sitting in a direct item, and we can't write to -** it. So, look through this page, and check all the mapped buffers -** to make sure they have valid block numbers. Any that don't need -** to be unmapped, so that __block_write_begin will correctly call -** reiserfs_get_block to convert the tail into an unformatted node -*/ +/* + * cutting the code is fine, since it really isn't in use yet and is easy + * to add back in. But, Vladimir has a really good idea here. Think + * about what happens for reading a file. For each page, + * The VFS layer calls reiserfs_readpage, who searches the tree to find + * an indirect item. This indirect item has X number of pointers, where + * X is a big number if we've done the block allocation right. But, + * we only use one or two of these pointers during each call to readpage, + * needlessly researching again later on. + * + * The size of the cache could be dynamic based on the size of the file. + * + * I'd also like to see us cache the location the stat data item, since + * we are needlessly researching for that frequently. + * + * --chris + */ + +/* + * If this page has a file tail in it, and + * it was read in by get_block_create_0, the page data is valid, + * but tail is still sitting in a direct item, and we can't write to + * it. So, look through this page, and check all the mapped buffers + * to make sure they have valid block numbers. Any that don't need + * to be unmapped, so that __block_write_begin will correctly call + * reiserfs_get_block to convert the tail into an unformatted node + */ static inline void fix_tail_page_for_writing(struct page *page) { struct buffer_head *head, *next, *bh; @@ -186,8 +206,10 @@ static inline void fix_tail_page_for_writing(struct page *page) } } -/* reiserfs_get_block does not need to allocate a block only if it has been - done already or non-hole position has been found in the indirect item */ +/* + * reiserfs_get_block does not need to allocate a block only if it has been + * done already or non-hole position has been found in the indirect item + */ static inline int allocation_needed(int retval, b_blocknr_t allocated, struct item_head *ih, __le32 * item, int pos_in_item) @@ -211,14 +233,16 @@ static inline void set_block_dev_mapped(struct buffer_head *bh, map_bh(bh, inode->i_sb, block); } -// -// files which were created in the earlier version can not be longer, -// than 2 gb -// +/* + * files which were created in the earlier version can not be longer, + * than 2 gb + */ static int file_capable(struct inode *inode, sector_t block) { - if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. - block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb + /* it is new file. */ + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || + /* old file, but 'block' is inside of 2gb */ + block < (1 << (31 - inode->i_sb->s_blocksize_bits))) return 1; return 0; @@ -228,7 +252,6 @@ static int restart_transaction(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path) { struct super_block *s = th->t_super; - int len = th->t_blocks_allocated; int err; BUG_ON(!th->t_trans_id); @@ -241,7 +264,7 @@ static int restart_transaction(struct reiserfs_transaction_handle *th, return 0; } reiserfs_update_sd(th, inode); - err = journal_end(th, s, len); + err = journal_end(th); if (!err) { err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); if (!err) @@ -250,14 +273,14 @@ static int restart_transaction(struct reiserfs_transaction_handle *th, return err; } -// it is called by get_block when create == 0. Returns block number -// for 'block'-th logical block of file. When it hits direct item it -// returns 0 (being called from bmap) or read direct item into piece -// of page (bh_result) - -// Please improve the english/clarity in the comment above, as it is -// hard to understand. - +/* + * it is called by get_block when create == 0. Returns block number + * for 'block'-th logical block of file. When it hits direct item it + * returns 0 (being called from bmap) or read direct item into piece + * of page (bh_result) + * Please improve the english/clarity in the comment above, as it is + * hard to understand. + */ static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh_result, int args) { @@ -273,7 +296,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, int done = 0; unsigned long offset; - // prepare the key to look for the 'block'-th block of file + /* prepare the key to look for the 'block'-th block of file */ make_cpu_key(&key, inode, (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3); @@ -285,23 +308,28 @@ static int _get_block_create_0(struct inode *inode, sector_t block, kunmap(bh_result->b_page); if (result == IO_ERROR) return -EIO; - // We do not return -ENOENT if there is a hole but page is uptodate, because it means - // That there is some MMAPED data associated with it that is yet to be written to disk. + /* + * We do not return -ENOENT if there is a hole but page is + * uptodate, because it means that there is some MMAPED data + * associated with it that is yet to be written to disk. + */ if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page)) { return -ENOENT; } return 0; } - // + bh = get_last_bh(&path); - ih = get_ih(&path); + ih = tp_item_head(&path); if (is_indirect_le_ih(ih)) { - __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); + __le32 *ind_item = (__le32 *) ih_item_body(bh, ih); - /* FIXME: here we could cache indirect item or part of it in - the inode to avoid search_by_key in case of subsequent - access to file */ + /* + * FIXME: here we could cache indirect item or part of it in + * the inode to avoid search_by_key in case of subsequent + * access to file + */ blocknr = get_block_num(ind_item, path.pos_in_item); ret = 0; if (blocknr) { @@ -311,8 +339,12 @@ static int _get_block_create_0(struct inode *inode, sector_t block, set_buffer_boundary(bh_result); } } else - // We do not return -ENOENT if there is a hole but page is uptodate, because it means - // That there is some MMAPED data associated with it that is yet to be written to disk. + /* + * We do not return -ENOENT if there is a hole but + * page is uptodate, because it means that there is + * some MMAPED data associated with it that is + * yet to be written to disk. + */ if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page)) { ret = -ENOENT; @@ -323,41 +355,45 @@ static int _get_block_create_0(struct inode *inode, sector_t block, kunmap(bh_result->b_page); return ret; } - // requested data are in direct item(s) + /* requested data are in direct item(s) */ if (!(args & GET_BLOCK_READ_DIRECT)) { - // we are called by bmap. FIXME: we can not map block of file - // when it is stored in direct item(s) + /* + * we are called by bmap. FIXME: we can not map block of file + * when it is stored in direct item(s) + */ pathrelse(&path); if (p) kunmap(bh_result->b_page); return -ENOENT; } - /* if we've got a direct item, and the buffer or page was uptodate, - ** we don't want to pull data off disk again. skip to the - ** end, where we map the buffer and return + /* + * if we've got a direct item, and the buffer or page was uptodate, + * we don't want to pull data off disk again. skip to the + * end, where we map the buffer and return */ if (buffer_uptodate(bh_result)) { goto finished; } else /* - ** grab_tail_page can trigger calls to reiserfs_get_block on up to date - ** pages without any buffers. If the page is up to date, we don't want - ** read old data off disk. Set the up to date bit on the buffer instead - ** and jump to the end + * grab_tail_page can trigger calls to reiserfs_get_block on + * up to date pages without any buffers. If the page is up + * to date, we don't want read old data off disk. Set the up + * to date bit on the buffer instead and jump to the end */ if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { set_buffer_uptodate(bh_result); goto finished; } - // read file tail into part of page + /* read file tail into part of page */ offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); copy_item_head(&tmp_ih, ih); - /* we only want to kmap if we are reading the tail into the page. - ** this is not the common case, so we don't kmap until we are - ** sure we need to. But, this means the item might move if - ** kmap schedules + /* + * we only want to kmap if we are reading the tail into the page. + * this is not the common case, so we don't kmap until we are + * sure we need to. But, this means the item might move if + * kmap schedules */ if (!p) p = (char *)kmap(bh_result->b_page); @@ -368,10 +404,11 @@ static int _get_block_create_0(struct inode *inode, sector_t block, if (!is_direct_le_ih(ih)) { BUG(); } - /* make sure we don't read more bytes than actually exist in - ** the file. This can happen in odd cases where i_size isn't - ** correct, and when direct item padding results in a few - ** extra bytes at the end of the direct item + /* + * make sure we don't read more bytes than actually exist in + * the file. This can happen in odd cases where i_size isn't + * correct, and when direct item padding results in a few + * extra bytes at the end of the direct item */ if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) break; @@ -383,40 +420,43 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } else { chars = ih_item_len(ih) - path.pos_in_item; } - memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); + memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars); if (done) break; p += chars; + /* + * we done, if read direct item is not the last item of + * node FIXME: we could try to check right delimiting key + * to see whether direct item continues in the right + * neighbor or rely on i_size + */ if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) - // we done, if read direct item is not the last item of - // node FIXME: we could try to check right delimiting key - // to see whether direct item continues in the right - // neighbor or rely on i_size break; - // update key to look for the next piece + /* update key to look for the next piece */ set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); result = search_for_position_by_key(inode->i_sb, &key, &path); if (result != POSITION_FOUND) - // i/o error most likely + /* i/o error most likely */ break; bh = get_last_bh(&path); - ih = get_ih(&path); + ih = tp_item_head(&path); } while (1); flush_dcache_page(bh_result->b_page); kunmap(bh_result->b_page); - finished: +finished: pathrelse(&path); if (result == IO_ERROR) return -EIO; - /* this buffer has valid data, but isn't valid for io. mapping it to + /* + * this buffer has valid data, but isn't valid for io. mapping it to * block #0 tells the rest of reiserfs it just has a tail in it */ map_bh(bh_result, inode->i_sb, 0); @@ -424,8 +464,10 @@ static int _get_block_create_0(struct inode *inode, sector_t block, return 0; } -// this is called to create file map. So, _get_block_create_0 will not -// read direct item +/* + * this is called to create file map. So, _get_block_create_0 will not + * read direct item + */ static int reiserfs_bmap(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { @@ -439,22 +481,23 @@ static int reiserfs_bmap(struct inode *inode, sector_t block, return 0; } -/* special version of get_block that is only used by grab_tail_page right -** now. It is sent to __block_write_begin, and when you try to get a -** block past the end of the file (or a block from a hole) it returns -** -ENOENT instead of a valid buffer. __block_write_begin expects to -** be able to do i/o on the buffers returned, unless an error value -** is also returned. -** -** So, this allows __block_write_begin to be used for reading a single block -** in a page. Where it does not produce a valid page for holes, or past the -** end of the file. This turns out to be exactly what we need for reading -** tails for conversion. -** -** The point of the wrapper is forcing a certain value for create, even -** though the VFS layer is calling this function with create==1. If you -** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, -** don't use this function. +/* + * special version of get_block that is only used by grab_tail_page right + * now. It is sent to __block_write_begin, and when you try to get a + * block past the end of the file (or a block from a hole) it returns + * -ENOENT instead of a valid buffer. __block_write_begin expects to + * be able to do i/o on the buffers returned, unless an error value + * is also returned. + * + * So, this allows __block_write_begin to be used for reading a single block + * in a page. Where it does not produce a valid page for holes, or past the + * end of the file. This turns out to be exactly what we need for reading + * tails for conversion. + * + * The point of the wrapper is forcing a certain value for create, even + * though the VFS layer is calling this function with create==1. If you + * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, + * don't use this function. */ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh_result, @@ -463,8 +506,10 @@ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); } -/* This is special helper for reiserfs_get_block in case we are executing - direct_IO request. */ +/* + * This is special helper for reiserfs_get_block in case we are executing + * direct_IO request. + */ static int reiserfs_get_blocks_direct_io(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, @@ -474,9 +519,11 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, bh_result->b_page = NULL; - /* We set the b_size before reiserfs_get_block call since it is - referenced in convert_tail_for_hole() that may be called from - reiserfs_get_block() */ + /* + * We set the b_size before reiserfs_get_block call since it is + * referenced in convert_tail_for_hole() that may be called from + * reiserfs_get_block() + */ bh_result->b_size = (1 << inode->i_blkbits); ret = reiserfs_get_block(inode, iblock, bh_result, @@ -486,14 +533,18 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, /* don't allow direct io onto tail pages */ if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* make sure future calls to the direct io funcs for this offset - ** in the file fail by unmapping the buffer + /* + * make sure future calls to the direct io funcs for this + * offset in the file fail by unmapping the buffer */ clear_buffer_mapped(bh_result); ret = -EINVAL; } - /* Possible unpacked tail. Flush the data before pages have - disappeared */ + + /* + * Possible unpacked tail. Flush the data before pages have + * disappeared + */ if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { int err; @@ -507,20 +558,20 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, if (err < 0) ret = err; } - out: +out: return ret; } /* -** helper function for when reiserfs_get_block is called for a hole -** but the file tail is still in a direct item -** bh_result is the buffer head for the hole -** tail_offset is the offset of the start of the tail in the file -** -** This calls prepare_write, which will start a new transaction -** you should not be in a transaction, or have any paths held when you -** call this. -*/ + * helper function for when reiserfs_get_block is called for a hole + * but the file tail is still in a direct item + * bh_result is the buffer head for the hole + * tail_offset is the offset of the start of the tail in the file + * + * This calls prepare_write, which will start a new transaction + * you should not be in a transaction, or have any paths held when you + * call this. + */ static int convert_tail_for_hole(struct inode *inode, struct buffer_head *bh_result, loff_t tail_offset) @@ -540,9 +591,10 @@ static int convert_tail_for_hole(struct inode *inode, tail_end = (tail_start | (bh_result->b_size - 1)) + 1; index = tail_offset >> PAGE_CACHE_SHIFT; - /* hole_page can be zero in case of direct_io, we are sure - that we cannot get here if we write with O_DIRECT into - tail page */ + /* + * hole_page can be zero in case of direct_io, we are sure + * that we cannot get here if we write with O_DIRECT into tail page + */ if (!hole_page || index != hole_page->index) { tail_page = grab_cache_page(inode->i_mapping, index); retval = -ENOMEM; @@ -553,14 +605,15 @@ static int convert_tail_for_hole(struct inode *inode, tail_page = hole_page; } - /* we don't have to make sure the conversion did not happen while - ** we were locking the page because anyone that could convert - ** must first take i_mutex. - ** - ** We must fix the tail page for writing because it might have buffers - ** that are mapped, but have a block number of 0. This indicates tail - ** data that has been read directly into the page, and - ** __block_write_begin won't trigger a get_block in this case. + /* + * we don't have to make sure the conversion did not happen while + * we were locking the page because anyone that could convert + * must first take i_mutex. + * + * We must fix the tail page for writing because it might have buffers + * that are mapped, but have a block number of 0. This indicates tail + * data that has been read directly into the page, and + * __block_write_begin won't trigger a get_block in this case. */ fix_tail_page_for_writing(tail_page); retval = __reiserfs_write_begin(tail_page, tail_start, @@ -573,12 +626,12 @@ static int convert_tail_for_hole(struct inode *inode, retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); - unlock: +unlock: if (tail_page != hole_page) { unlock_page(tail_page); page_cache_release(tail_page); } - out: +out: return retval; } @@ -604,7 +657,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int repeat, retval = 0; - b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int + /* b_blocknr_t is (unsigned) 32 bit int*/ + b_blocknr_t allocated_block_nr = 0; INITIALIZE_PATH(path); int pos_in_item; struct cpu_key key; @@ -614,12 +668,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block, int done; int fs_gen; struct reiserfs_transaction_handle *th = NULL; - /* space reserved in transaction batch: - . 3 balancings in direct->indirect conversion - . 1 block involved into reiserfs_update_sd() - XXX in practically impossible worst case direct2indirect() - can incur (much) more than 3 balancings. - quota update for user, group */ + /* + * space reserved in transaction batch: + * . 3 balancings in direct->indirect conversion + * . 1 block involved into reiserfs_update_sd() + * XXX in practically impossible worst case direct2indirect() + * can incur (much) more than 3 balancings. + * quota update for user, group + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); @@ -636,8 +692,9 @@ int reiserfs_get_block(struct inode *inode, sector_t block, return -EFBIG; } - /* if !create, we aren't changing the FS, so we don't need to - ** log anything, so we don't need to start a transaction + /* + * if !create, we aren't changing the FS, so we don't need to + * log anything, so we don't need to start a transaction */ if (!(create & GET_BLOCK_CREATE)) { int ret; @@ -647,6 +704,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_write_unlock(inode->i_sb); return ret; } + /* * if we're already in a transaction, make sure to close * any new transactions we start in this func @@ -655,8 +713,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_transaction_running(inode->i_sb)) dangle = 0; - /* If file is of such a size, that it might have a tail and tails are enabled - ** we should mark it as possibly needing tail packing on close + /* + * If file is of such a size, that it might have a tail and + * tails are enabled we should mark it as possibly needing + * tail packing on close */ if ((have_large_tails(inode->i_sb) && inode->i_size < i_block_size(inode) * 4) @@ -667,7 +727,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, /* set the key of the first byte in the 'block'-th block of file */ make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { - start_trans: +start_trans: th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); if (!th) { retval = -ENOMEM; @@ -675,7 +735,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, } reiserfs_update_inode_transaction(inode); } - research: +research: retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { @@ -684,8 +744,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block, } bh = get_last_bh(&path); - ih = get_ih(&path); - item = get_item(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); pos_in_item = path.pos_in_item; fs_gen = get_generation(inode->i_sb); @@ -703,11 +763,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block, _allocate_block(th, block, inode, &allocated_block_nr, &path, create); + /* + * restart the transaction to give the journal a chance to free + * some blocks. releases the path, so we have to go back to + * research if we succeed on the second try + */ if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { - /* restart the transaction to give the journal a chance to free - ** some blocks. releases the path, so we have to go back to - ** research if we succeed on the second try - */ SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; retval = restart_transaction(th, inode, &path); if (retval) @@ -734,9 +795,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (indirect_item_found(retval, ih)) { b_blocknr_t unfm_ptr; - /* 'block'-th block is in the file already (there is - corresponding cell in some indirect item). But it may be - zero unformatted node pointer (hole) */ + /* + * 'block'-th block is in the file already (there is + * corresponding cell in some indirect item). But it may be + * zero unformatted node pointer (hole) + */ unfm_ptr = get_block_num(item, pos_in_item); if (unfm_ptr == 0) { /* use allocated block to plug the hole */ @@ -753,7 +816,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_add_ordered_list(inode, bh_result); put_block_num(item, pos_in_item, allocated_block_nr); unfm_ptr = allocated_block_nr; - journal_mark_dirty(th, inode->i_sb, bh); + journal_mark_dirty(th, bh); reiserfs_update_sd(th, inode); } set_block_dev_mapped(bh_result, unfm_ptr, inode); @@ -764,9 +827,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_write_unlock(inode->i_sb); - /* the item was found, so new blocks were not added to the file - ** there is no need to make sure the inode is updated with this - ** transaction + /* + * the item was found, so new blocks were not added to the file + * there is no need to make sure the inode is updated with this + * transaction */ return retval; } @@ -776,9 +840,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, goto start_trans; } - /* desired position is not found or is in the direct item. We have - to append file with holes up to 'block'-th block converting - direct items to indirect one if necessary */ + /* + * desired position is not found or is in the direct item. We have + * to append file with holes up to 'block'-th block converting + * direct items to indirect one if necessary + */ done = 0; do { if (is_statdata_le_ih(ih)) { @@ -790,16 +856,18 @@ int reiserfs_get_block(struct inode *inode, sector_t block, TYPE_INDIRECT, UNFM_P_SIZE, 0 /* free_space */ ); + /* + * we are going to add 'block'-th block to the file. + * Use allocated block for that + */ if (cpu_key_k_offset(&key) == 1) { - /* we are going to add 'block'-th block to the file. Use - allocated block for that */ unp = cpu_to_le32(allocated_block_nr); set_block_dev_mapped(bh_result, allocated_block_nr, inode); set_buffer_new(bh_result); done = 1; } - tmp_key = key; // ;) + tmp_key = key; /* ;) */ set_cpu_key_k_offset(&tmp_key, 1); PATH_LAST_POSITION(&path)++; @@ -809,9 +877,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (retval) { reiserfs_free_block(th, inode, allocated_block_nr, 1); - goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST + /* + * retval == -ENOSPC, -EDQUOT or -EIO + * or -EEXIST + */ + goto failure; } - //mark_tail_converted (inode); } else if (is_direct_le_ih(ih)) { /* direct item has to be converted */ loff_t tail_offset; @@ -819,18 +890,24 @@ int reiserfs_get_block(struct inode *inode, sector_t block, tail_offset = ((le_ih_k_offset(ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + + /* + * direct item we just found fits into block we have + * to map. Convert it into unformatted node: use + * bh_result for the conversion + */ if (tail_offset == cpu_key_k_offset(&key)) { - /* direct item we just found fits into block we have - to map. Convert it into unformatted node: use - bh_result for the conversion */ set_block_dev_mapped(bh_result, allocated_block_nr, inode); unbh = bh_result; done = 1; } else { - /* we have to padd file tail stored in direct item(s) - up to block size and convert it to unformatted - node. FIXME: this should also get into page cache */ + /* + * we have to pad file tail stored in direct + * item(s) up to block size and convert it + * to unformatted node. FIXME: this should + * also get into page cache + */ pathrelse(&path); /* @@ -859,7 +936,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, inode->i_ino, retval); if (allocated_block_nr) { - /* the bitmap, the super, and the stat data == 3 */ + /* + * the bitmap, the super, + * and the stat data == 3 + */ if (!th) th = reiserfs_persistent_transaction(inode->i_sb, 3); if (th) @@ -881,43 +961,57 @@ int reiserfs_get_block(struct inode *inode, sector_t block, allocated_block_nr, 1); goto failure; } - /* it is important the set_buffer_uptodate is done after - ** the direct2indirect. The buffer might contain valid - ** data newer than the data on disk (read by readpage, changed, - ** and then sent here by writepage). direct2indirect needs - ** to know if unbh was already up to date, so it can decide - ** if the data in unbh needs to be replaced with data from - ** the disk + /* + * it is important the set_buffer_uptodate is done + * after the direct2indirect. The buffer might + * contain valid data newer than the data on disk + * (read by readpage, changed, and then sent here by + * writepage). direct2indirect needs to know if unbh + * was already up to date, so it can decide if the + * data in unbh needs to be replaced with data from + * the disk */ set_buffer_uptodate(unbh); - /* unbh->b_page == NULL in case of DIRECT_IO request, this means - buffer will disappear shortly, so it should not be added to + /* + * unbh->b_page == NULL in case of DIRECT_IO request, + * this means buffer will disappear shortly, so it + * should not be added to */ if (unbh->b_page) { - /* we've converted the tail, so we must - ** flush unbh before the transaction commits + /* + * we've converted the tail, so we must + * flush unbh before the transaction commits */ reiserfs_add_tail_list(inode, unbh); - /* mark it dirty now to prevent commit_write from adding - ** this buffer to the inode's dirty buffer list + /* + * mark it dirty now to prevent commit_write + * from adding this buffer to the inode's + * dirty buffer list */ /* - * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). - * It's still atomic, but it sets the page dirty too, - * which makes it eligible for writeback at any time by the - * VM (which was also the case with __mark_buffer_dirty()) + * AKPM: changed __mark_buffer_dirty to + * mark_buffer_dirty(). It's still atomic, + * but it sets the page dirty too, which makes + * it eligible for writeback at any time by the + * VM (which was also the case with + * __mark_buffer_dirty()) */ mark_buffer_dirty(unbh); } } else { - /* append indirect item with holes if needed, when appending - pointer to 'block'-th block use block, which is already - allocated */ + /* + * append indirect item with holes if needed, when + * appending pointer to 'block'-th block use block, + * which is already allocated + */ struct cpu_key tmp_key; - unp_t unf_single = 0; // We use this in case we need to allocate only - // one block which is a fastpath + /* + * We use this in case we need to allocate + * only one block which is a fastpath + */ + unp_t unf_single = 0; unp_t *un; __u64 max_to_insert = MAX_ITEM_LEN(inode->i_sb->s_blocksize) / @@ -926,14 +1020,17 @@ int reiserfs_get_block(struct inode *inode, sector_t block, RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, "vs-804: invalid position for append"); - /* indirect item has to be appended, set up key of that position */ + /* + * indirect item has to be appended, + * set up key of that position + * (key type is unimportant) + */ make_cpu_key(&tmp_key, inode, le_key_k_offset(version, - &(ih->ih_key)) + + &ih->ih_key) + op_bytes_number(ih, inode->i_sb->s_blocksize), - //pos_in_item * inode->i_sb->s_blocksize, - TYPE_INDIRECT, 3); // key type is unimportant + TYPE_INDIRECT, 3); RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), "green-805: invalid offset"); @@ -954,8 +1051,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block, } } if (blocks_needed <= max_to_insert) { - /* we are going to add target block to the file. Use allocated - block for that */ + /* + * we are going to add target block to + * the file. Use allocated block for that + */ un[blocks_needed - 1] = cpu_to_le32(allocated_block_nr); set_block_dev_mapped(bh_result, @@ -964,8 +1063,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, done = 1; } else { /* paste hole to the indirect item */ - /* If kmalloc failed, max_to_insert becomes zero and it means we - only have space for one block */ + /* + * If kmalloc failed, max_to_insert becomes + * zero and it means we only have space for + * one block + */ blocks_needed = max_to_insert ? max_to_insert : 1; } @@ -984,9 +1086,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block, goto failure; } if (!done) { - /* We need to mark new file size in case this function will be - interrupted/aborted later on. And we may do this only for - holes. */ + /* + * We need to mark new file size in case + * this function will be interrupted/aborted + * later on. And we may do this only for + * holes. + */ inode->i_size += inode->i_sb->s_blocksize * blocks_needed; } @@ -995,13 +1100,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (done == 1) break; - /* this loop could log more blocks than we had originally asked - ** for. So, we have to allow the transaction to end if it is - ** too big or too full. Update the inode so things are - ** consistent if we crash before the function returns - ** - ** release the path so that anybody waiting on the path before - ** ending their transaction will be able to continue. + /* + * this loop could log more blocks than we had originally + * asked for. So, we have to allow the transaction to end + * if it is too big or too full. Update the inode so things + * are consistent if we crash before the function returns + * release the path so that anybody waiting on the path before + * ending their transaction will be able to continue. */ if (journal_transaction_should_end(th, th->t_blocks_allocated)) { retval = restart_transaction(th, inode, &path); @@ -1031,14 +1136,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block, goto failure; } bh = get_last_bh(&path); - ih = get_ih(&path); - item = get_item(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); pos_in_item = path.pos_in_item; } while (1); retval = 0; - failure: +failure: if (th && (!dangle || (retval && !th->t_trans_id))) { int err; if (th->t_trans_id) @@ -1060,8 +1165,10 @@ reiserfs_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); } -/* Compute real number of used bytes by file - * Following three functions can go away when we'll have enough space in stat item +/* + * Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in + * stat item */ static int real_space_diff(struct inode *inode, int sd_size) { @@ -1071,13 +1178,14 @@ static int real_space_diff(struct inode *inode, int sd_size) if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) return sd_size; - /* End of file is also in full block with indirect reference, so round - ** up to the next block. - ** - ** there is just no way to know if the tail is actually packed - ** on the file, so we have to assume it isn't. When we pack the - ** tail, we add 4 bytes to pretend there really is an unformatted - ** node pointer + /* + * End of file is also in full block with indirect reference, so round + * up to the next block. + * + * there is just no way to know if the tail is actually packed + * on the file, so we have to assume it isn't. When we pack the + * tail, we add 4 bytes to pretend there really is an unformatted + * node pointer */ bytes = ((inode->i_size + @@ -1108,36 +1216,36 @@ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) bytes += (loff_t) 511; } - /* files from before the quota patch might i_blocks such that - ** bytes < real_space. Deal with that here to prevent it from - ** going negative. + /* + * files from before the quota patch might i_blocks such that + * bytes < real_space. Deal with that here to prevent it from + * going negative. */ if (bytes < real_space) return 0; return (bytes - real_space) >> 9; } -// -// BAD: new directories have stat data of new type and all other items -// of old type. Version stored in the inode says about body items, so -// in update_stat_data we can not rely on inode, but have to check -// item version directly -// +/* + * BAD: new directories have stat data of new type and all other items + * of old type. Version stored in the inode says about body items, so + * in update_stat_data we can not rely on inode, but have to check + * item version directly + */ -// called by read_locked_inode +/* called by read_locked_inode */ static void init_inode(struct inode *inode, struct treepath *path) { struct buffer_head *bh; struct item_head *ih; __u32 rdev; - //int version = ITEM_VERSION_1; bh = PATH_PLAST_BUFFER(path); - ih = PATH_PITEM_HEAD(path); + ih = tp_item_head(path); - copy_key(INODE_PKEY(inode), &(ih->ih_key)); + copy_key(INODE_PKEY(inode), &ih->ih_key); - INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); REISERFS_I(inode)->i_flags = 0; REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; @@ -1147,7 +1255,7 @@ static void init_inode(struct inode *inode, struct treepath *path) if (stat_data_v1(ih)) { struct stat_data_v1 *sd = - (struct stat_data_v1 *)B_I_PITEM(bh, ih); + (struct stat_data_v1 *)ih_item_body(bh, ih); unsigned long blocks; set_inode_item_key_version(inode, KEY_FORMAT_3_5); @@ -1168,20 +1276,26 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); blocks = (inode->i_size + 511) >> 9; blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); + + /* + * there was a bug in <=3.5.23 when i_blocks could take + * negative values. Starting from 3.5.17 this value could + * even be stored in stat data. For such files we set + * i_blocks based on file size. Just 2 notes: this can be + * wrong for sparse files. On-disk value will be only + * updated if file's inode will ever change + */ if (inode->i_blocks > blocks) { - // there was a bug in <=3.5.23 when i_blocks could take negative - // values. Starting from 3.5.17 this value could even be stored in - // stat data. For such files we set i_blocks based on file - // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be - // only updated if file's inode will ever change inode->i_blocks = blocks; } rdev = sd_v1_rdev(sd); REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd); - /* an early bug in the quota code can give us an odd number for the - ** block count. This is incorrect, fix it here. + + /* + * an early bug in the quota code can give us an odd + * number for the block count. This is incorrect, fix it here. */ if (inode->i_blocks & 1) { inode->i_blocks++; @@ -1189,13 +1303,17 @@ static void init_inode(struct inode *inode, struct treepath *path) inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, SD_V1_SIZE)); - /* nopack is initially zero for v1 objects. For v2 objects, - nopack is initialised from sd_attrs */ + /* + * nopack is initially zero for v1 objects. For v2 objects, + * nopack is initialised from sd_attrs + */ REISERFS_I(inode)->i_flags &= ~i_nopack_mask; } else { - // new stat data found, but object may have old items - // (directories and symlinks) - struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); + /* + * new stat data found, but object may have old items + * (directories and symlinks) + */ + struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih); inode->i_mode = sd_v2_mode(sd); set_nlink(inode, sd_v2_nlink(sd)); @@ -1225,8 +1343,10 @@ static void init_inode(struct inode *inode, struct treepath *path) inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, SD_V2_SIZE)); - /* read persistent inode attributes from sd and initialise - generic inode flags from them */ + /* + * read persistent inode attributes from sd and initialise + * generic inode flags from them + */ REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); } @@ -1249,7 +1369,7 @@ static void init_inode(struct inode *inode, struct treepath *path) } } -// update new stat data with inode fields +/* update new stat data with inode fields */ static void inode2sd(void *sd, struct inode *inode, loff_t size) { struct stat_data *sd_v2 = (struct stat_data *)sd; @@ -1273,7 +1393,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_attrs(sd_v2, flags); } -// used to copy inode's fields to old stat data +/* used to copy inode's fields to old stat data */ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) { struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; @@ -1292,14 +1412,15 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) else set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); - // Sigh. i_first_direct_byte is back + /* Sigh. i_first_direct_byte is back */ set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte); } -/* NOTE, you must prepare the buffer head before sending it here, -** and then log it after the call -*/ +/* + * NOTE, you must prepare the buffer head before sending it here, + * and then log it after the call + */ static void update_stat_data(struct treepath *path, struct inode *inode, loff_t size) { @@ -1307,17 +1428,17 @@ static void update_stat_data(struct treepath *path, struct inode *inode, struct item_head *ih; bh = PATH_PLAST_BUFFER(path); - ih = PATH_PITEM_HEAD(path); + ih = tp_item_head(path); if (!is_statdata_le_ih(ih)) reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h", INODE_PKEY(inode), ih); + /* path points to old stat data */ if (stat_data_v1(ih)) { - // path points to old stat data - inode2sd_v1(B_I_PITEM(bh, ih), inode, size); + inode2sd_v1(ih_item_body(bh, ih), inode, size); } else { - inode2sd(B_I_PITEM(bh, ih), inode, size); + inode2sd(ih_item_body(bh, ih), inode, size); } return; @@ -1335,7 +1456,8 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); - make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); //key type is unimportant + /* key type is unimportant */ + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); for (;;) { int pos; @@ -1363,45 +1485,48 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, return; } - /* sigh, prepare_for_journal might schedule. When it schedules the - ** FS might change. We have to detect that, and loop back to the - ** search if the stat data item has moved + /* + * sigh, prepare_for_journal might schedule. When it + * schedules the FS might change. We have to detect that, + * and loop back to the search if the stat data item has moved */ bh = get_last_bh(&path); - ih = get_ih(&path); + ih = tp_item_head(&path); copy_item_head(&tmp_ih, ih); fs_gen = get_generation(inode->i_sb); reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + /* Stat_data item has been moved after scheduling. */ if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { reiserfs_restore_prepared_buffer(inode->i_sb, bh); - continue; /* Stat_data item has been moved after scheduling. */ + continue; } break; } update_stat_data(&path, inode, size); - journal_mark_dirty(th, th->t_super, bh); + journal_mark_dirty(th, bh); pathrelse(&path); return; } -/* reiserfs_read_locked_inode is called to read the inode off disk, and it -** does a make_bad_inode when things go wrong. But, we need to make sure -** and clear the key in the private portion of the inode, otherwise a -** corresponding iput might try to delete whatever object the inode last -** represented. -*/ +/* + * reiserfs_read_locked_inode is called to read the inode off disk, and it + * does a make_bad_inode when things go wrong. But, we need to make sure + * and clear the key in the private portion of the inode, otherwise a + * corresponding iput might try to delete whatever object the inode last + * represented. + */ static void reiserfs_make_bad_inode(struct inode *inode) { memset(INODE_PKEY(inode), 0, KEY_SIZE); make_bad_inode(inode); } -// -// initially this function was derived from minix or ext2's analog and -// evolved as the prototype did -// - +/* + * initially this function was derived from minix or ext2's analog and + * evolved as the prototype did + */ int reiserfs_init_locked_inode(struct inode *inode, void *p) { struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; @@ -1410,8 +1535,10 @@ int reiserfs_init_locked_inode(struct inode *inode, void *p) return 0; } -/* looks for stat data in the tree, and fills up the fields of in-core - inode stat data fields */ +/* + * looks for stat data in the tree, and fills up the fields of in-core + * inode stat data fields + */ void reiserfs_read_locked_inode(struct inode *inode, struct reiserfs_iget_args *args) { @@ -1422,8 +1549,10 @@ void reiserfs_read_locked_inode(struct inode *inode, dirino = args->dirid; - /* set version 1, version 2 could be used too, because stat data - key is the same in both versions */ + /* + * set version 1, version 2 could be used too, because stat data + * key is the same in both versions + */ key.version = KEY_FORMAT_3_5; key.on_disk_key.k_dir_id = dirino; key.on_disk_key.k_objectid = inode->i_ino; @@ -1439,8 +1568,9 @@ void reiserfs_read_locked_inode(struct inode *inode, reiserfs_make_bad_inode(inode); return; } + + /* a stale NFS handle can trigger this without it being an error */ if (retval != ITEM_FOUND) { - /* a stale NFS handle can trigger this without it being an error */ pathrelse(&path_to_sd); reiserfs_make_bad_inode(inode); clear_nlink(inode); @@ -1449,20 +1579,25 @@ void reiserfs_read_locked_inode(struct inode *inode, init_inode(inode, &path_to_sd); - /* It is possible that knfsd is trying to access inode of a file - that is being removed from the disk by some other thread. As we - update sd on unlink all that is required is to check for nlink - here. This bug was first found by Sizif when debugging - SquidNG/Butterfly, forgotten, and found again after Philippe - Gramoulle <philippe.gramoulle@mmania.com> reproduced it. - - More logical fix would require changes in fs/inode.c:iput() to - remove inode from hash-table _after_ fs cleaned disk stuff up and - in iget() to return NULL if I_FREEING inode is found in - hash-table. */ - /* Currently there is one place where it's ok to meet inode with - nlink==0: processing of open-unlinked and half-truncated files - during mount (fs/reiserfs/super.c:finish_unfinished()). */ + /* + * It is possible that knfsd is trying to access inode of a file + * that is being removed from the disk by some other thread. As we + * update sd on unlink all that is required is to check for nlink + * here. This bug was first found by Sizif when debugging + * SquidNG/Butterfly, forgotten, and found again after Philippe + * Gramoulle <philippe.gramoulle@mmania.com> reproduced it. + + * More logical fix would require changes in fs/inode.c:iput() to + * remove inode from hash-table _after_ fs cleaned disk stuff up and + * in iget() to return NULL if I_FREEING inode is found in + * hash-table. + */ + + /* + * Currently there is one place where it's ok to meet inode with + * nlink==0: processing of open-unlinked and half-truncated files + * during mount (fs/reiserfs/super.c:finish_unfinished()). + */ if ((inode->i_nlink == 0) && !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { reiserfs_warning(inode->i_sb, "vs-13075", @@ -1472,7 +1607,8 @@ void reiserfs_read_locked_inode(struct inode *inode, reiserfs_make_bad_inode(inode); } - reiserfs_check_path(&path_to_sd); /* init inode should be relsing */ + /* init inode should be relsing */ + reiserfs_check_path(&path_to_sd); /* * Stat data v1 doesn't support ACLs. @@ -1481,7 +1617,7 @@ void reiserfs_read_locked_inode(struct inode *inode, cache_no_acl(inode); } -/** +/* * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked(). * * @inode: inode from hash table to check @@ -1556,7 +1692,8 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb, struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - /* fhtype happens to reflect the number of u32s encoded. + /* + * fhtype happens to reflect the number of u32s encoded. * due to a bug in earlier code, fhtype might indicate there * are more u32s then actually fitted. * so if fhtype seems to be more than len, reduce fhtype. @@ -1625,13 +1762,16 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp, return *lenp; } -/* looks for stat data, then copies fields to it, marks the buffer - containing stat data as dirty */ -/* reiserfs inodes are never really dirty, since the dirty inode call -** always logs them. This call allows the VFS inode marking routines -** to properly mark inodes for datasync and such, but only actually -** does something when called for a synchronous update. -*/ +/* + * looks for stat data, then copies fields to it, marks the buffer + * containing stat data as dirty + */ +/* + * reiserfs inodes are never really dirty, since the dirty inode call + * always logs them. This call allows the VFS inode marking routines + * to properly mark inodes for datasync and such, but only actually + * does something when called for a synchronous update. + */ int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct reiserfs_transaction_handle th; @@ -1639,24 +1779,28 @@ int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_sb->s_flags & MS_RDONLY) return -EROFS; - /* memory pressure can sometimes initiate write_inode calls with sync == 1, - ** these cases are just when the system needs ram, not when the - ** inode needs to reach disk for safety, and they can safely be - ** ignored because the altered inode has already been logged. + /* + * memory pressure can sometimes initiate write_inode calls with + * sync == 1, + * these cases are just when the system needs ram, not when the + * inode needs to reach disk for safety, and they can safely be + * ignored because the altered inode has already been logged. */ if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) { reiserfs_write_lock(inode->i_sb); if (!journal_begin(&th, inode->i_sb, jbegin_count)) { reiserfs_update_sd(&th, inode); - journal_end_sync(&th, inode->i_sb, jbegin_count); + journal_end_sync(&th); } reiserfs_write_unlock(inode->i_sb); } return 0; } -/* stat data of new object is inserted already, this inserts the item - containing "." and ".." entries */ +/* + * stat data of new object is inserted already, this inserts the item + * containing "." and ".." entries + */ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, struct inode *inode, struct item_head *ih, struct treepath *path, @@ -1674,9 +1818,11 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3 /*key length */ ); - /* compose item head for new item. Directories consist of items of - old type (ITEM_VERSION_1). Do not set key (second arg is 0), it - is done by reiserfs_new_inode */ + /* + * compose item head for new item. Directories consist of items of + * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it + * is done by reiserfs_new_inode + */ if (old_format_only(sb)) { make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); @@ -1714,9 +1860,12 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, return reiserfs_insert_item(th, path, &key, ih, inode, body); } -/* stat data of object has been inserted, this inserts the item - containing the body of symlink */ -static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ +/* + * stat data of object has been inserted, this inserts the item + * containing the body of symlink + */ +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, + struct inode *inode, struct item_head *ih, struct treepath *path, const char *symname, int item_len) @@ -1754,15 +1903,26 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i return reiserfs_insert_item(th, path, &key, ih, inode, symname); } -/* inserts the stat data into the tree, and then calls - reiserfs_new_directory (to insert ".", ".." item if new object is - directory) or reiserfs_new_symlink (to insert symlink body if new - object is symlink) or nothing (if new object is regular file) - - NOTE! uid and gid must already be set in the inode. If we return - non-zero due to an error, we have to drop the quota previously allocated - for the fresh inode. This can only be done outside a transaction, so - if we return non-zero, we also end the transaction. */ +/* + * inserts the stat data into the tree, and then calls + * reiserfs_new_directory (to insert ".", ".." item if new object is + * directory) or reiserfs_new_symlink (to insert symlink body if new + * object is symlink) or nothing (if new object is regular file) + + * NOTE! uid and gid must already be set in the inode. If we return + * non-zero due to an error, we have to drop the quota previously allocated + * for the fresh inode. This can only be done outside a transaction, so + * if we return non-zero, we also end the transaction. + * + * @th: active transaction handle + * @dir: parent directory for new inode + * @mode: mode of new inode + * @symname: symlink contents if inode is symlink + * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for + * symlinks + * @inode: inode to be filled + * @security: optional security context to associate with this inode + */ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, struct inode *dir, umode_t mode, const char *symname, /* 0 for regular, EMTRY_DIR_SIZE for dirs, @@ -1807,7 +1967,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, else make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); - memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); + memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); depth = reiserfs_write_unlock_nested(inode->i_sb); @@ -1820,10 +1980,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } if (old_format_only(sb)) - /* not a perfect generation count, as object ids can be reused, but - ** this is as good as reiserfs can do right now. - ** note that the private part of inode isn't filled in yet, we have - ** to use the directory. + /* + * not a perfect generation count, as object ids can be reused, + * but this is as good as reiserfs can do right now. + * note that the private part of inode isn't filled in yet, + * we have to use the directory. */ inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); else @@ -1850,7 +2011,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; - INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list); REISERFS_I(inode)->i_flags = 0; REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; @@ -1878,9 +2039,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_bad_inode; } if (old_format_only(sb)) { + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) { pathrelse(&path_to_key); - /* i_uid or i_gid is too big to be stored in stat data v3.5 */ err = -EINVAL; goto out_bad_inode; } @@ -1888,9 +2049,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } else { inode2sd(&sd, inode, inode->i_size); } - // store in in-core inode the key of stat data and version all - // object items will have (directory items will have old offset - // format, other new objects will consist of new items) + /* + * store in in-core inode the key of stat data and version all + * object items will have (directory items will have old offset + * format, other new objects will consist of new items) + */ if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) set_inode_item_key_version(inode, KEY_FORMAT_3_5); else @@ -1934,7 +2097,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, if (retval) { err = retval; reiserfs_check_path(&path_to_key); - journal_end(th, th->t_super, th->t_blocks_allocated); + journal_end(th); goto out_inserted_sd; } @@ -1945,7 +2108,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, if (retval) { err = retval; reiserfs_check_path(&path_to_key); - journal_end(th, th->t_super, th->t_blocks_allocated); + journal_end(th); goto out_inserted_sd; } } else if (inode->i_sb->s_flags & MS_POSIXACL) { @@ -1962,8 +2125,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, if (retval) { err = retval; reiserfs_check_path(&path_to_key); - retval = journal_end(th, th->t_super, - th->t_blocks_allocated); + retval = journal_end(th); if (retval) err = retval; goto out_inserted_sd; @@ -1975,11 +2137,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, return 0; -/* it looks like you can easily compress these two goto targets into - * one. Keeping it like this doesn't actually hurt anything, and they - * are place holders for what the quota code actually needs. - */ - out_bad_inode: +out_bad_inode: /* Invalidate the object, nothing was inserted yet */ INODE_PKEY(inode)->k_objectid = 0; @@ -1988,16 +2146,19 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, dquot_free_inode(inode); reiserfs_write_lock_nested(inode->i_sb, depth); - out_end_trans: - journal_end(th, th->t_super, th->t_blocks_allocated); - /* Drop can be outside and it needs more credits so it's better to have it outside */ +out_end_trans: + journal_end(th); + /* + * Drop can be outside and it needs more credits so it's better + * to have it outside + */ depth = reiserfs_write_unlock_nested(inode->i_sb); dquot_drop(inode); reiserfs_write_lock_nested(inode->i_sb, depth); inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); - out_inserted_sd: +out_inserted_sd: clear_nlink(inode); th->t_trans_id = 0; /* so the caller can't use this handle later */ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */ @@ -2006,25 +2167,26 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } /* -** finds the tail page in the page cache, -** reads the last block in. -** -** On success, page_result is set to a locked, pinned page, and bh_result -** is set to an up to date buffer for the last block in the file. returns 0. -** -** tail conversion is not done, so bh_result might not be valid for writing -** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before -** trying to write the block. -** -** on failure, nonzero is returned, page_result and bh_result are untouched. -*/ + * finds the tail page in the page cache, + * reads the last block in. + * + * On success, page_result is set to a locked, pinned page, and bh_result + * is set to an up to date buffer for the last block in the file. returns 0. + * + * tail conversion is not done, so bh_result might not be valid for writing + * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before + * trying to write the block. + * + * on failure, nonzero is returned, page_result and bh_result are untouched. + */ static int grab_tail_page(struct inode *inode, struct page **page_result, struct buffer_head **bh_result) { - /* we want the page with the last byte in the file, - ** not the page that will hold the next byte for appending + /* + * we want the page with the last byte in the file, + * not the page that will hold the next byte for appending */ unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; unsigned long pos = 0; @@ -2036,10 +2198,11 @@ static int grab_tail_page(struct inode *inode, struct page *page; int error; - /* we know that we are only called with inode->i_size > 0. - ** we also know that a file tail can never be as big as a block - ** If i_size % blocksize == 0, our file is currently block aligned - ** and it won't need converting or zeroing after a truncate. + /* + * we know that we are only called with inode->i_size > 0. + * we also know that a file tail can never be as big as a block + * If i_size % blocksize == 0, our file is currently block aligned + * and it won't need converting or zeroing after a truncate. */ if ((offset & (blocksize - 1)) == 0) { return -ENOENT; @@ -2068,10 +2231,11 @@ static int grab_tail_page(struct inode *inode, } while (bh != head); if (!buffer_uptodate(bh)) { - /* note, this should never happen, prepare_write should - ** be taking care of this for us. If the buffer isn't up to date, - ** I've screwed up the code to find the buffer, or the code to - ** call prepare_write + /* + * note, this should never happen, prepare_write should be + * taking care of this for us. If the buffer isn't up to + * date, I've screwed up the code to find the buffer, or the + * code to call prepare_write */ reiserfs_error(inode->i_sb, "clm-6000", "error reading block %lu", bh->b_blocknr); @@ -2081,21 +2245,21 @@ static int grab_tail_page(struct inode *inode, *bh_result = bh; *page_result = page; - out: +out: return error; - unlock: +unlock: unlock_page(page); page_cache_release(page); return error; } /* -** vfs version of truncate file. Must NOT be called with -** a transaction already started. -** -** some code taken from block_truncate_page -*/ + * vfs version of truncate file. Must NOT be called with + * a transaction already started. + * + * some code taken from block_truncate_page + */ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) { struct reiserfs_transaction_handle th; @@ -2113,9 +2277,11 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) if (inode->i_size > 0) { error = grab_tail_page(inode, &page, &bh); if (error) { - // -ENOENT means we truncated past the end of the file, - // and get_block_create_0 could not find a block to read in, - // which is ok. + /* + * -ENOENT means we truncated past the end of the + * file, and get_block_create_0 could not find a + * block to read in, which is ok. + */ if (error != -ENOENT) reiserfs_error(inode->i_sb, "clm-6001", "grab_tail_page failed %d", @@ -2125,29 +2291,33 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) } } - /* so, if page != NULL, we have a buffer head for the offset at - ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, - ** then we have an unformatted node. Otherwise, we have a direct item, - ** and no zeroing is required on disk. We zero after the truncate, - ** because the truncate might pack the item anyway - ** (it will unmap bh if it packs). + /* + * so, if page != NULL, we have a buffer head for the offset at + * the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + * then we have an unformatted node. Otherwise, we have a direct item, + * and no zeroing is required on disk. We zero after the truncate, + * because the truncate might pack the item anyway + * (it will unmap bh if it packs). + * + * it is enough to reserve space in transaction for 2 balancings: + * one for "save" link adding and another for the first + * cut_from_item. 1 is for update_sd */ - /* it is enough to reserve space in transaction for 2 balancings: - one for "save" link adding and another for the first - cut_from_item. 1 is for update_sd */ error = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); if (error) goto out; reiserfs_update_inode_transaction(inode); if (update_timestamps) - /* we are doing real truncate: if the system crashes before the last - transaction of truncating gets committed - on reboot the file - either appears truncated properly or not truncated at all */ + /* + * we are doing real truncate: if the system crashes + * before the last transaction of truncating gets committed + * - on reboot the file either appears truncated properly + * or not truncated at all + */ add_save_link(&th, inode, 1); err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps); - error = - journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); + error = journal_end(&th); if (error) goto out; @@ -2180,7 +2350,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) reiserfs_write_unlock(inode->i_sb); return 0; - out: +out: if (page) { unlock_page(page); page_cache_release(page); @@ -2212,7 +2382,10 @@ static int map_block_for_writepage(struct inode *inode, int copy_size; int trans_running = 0; - /* catch places below that try to log something without starting a trans */ + /* + * catch places below that try to log something without + * starting a trans + */ th.t_trans_id = 0; if (!buffer_uptodate(bh_result)) { @@ -2220,11 +2393,11 @@ static int map_block_for_writepage(struct inode *inode, } kmap(bh_result->b_page); - start_over: +start_over: reiserfs_write_lock(inode->i_sb); make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); - research: +research: retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval != POSITION_FOUND) { use_get_block = 1; @@ -2232,8 +2405,8 @@ static int map_block_for_writepage(struct inode *inode, } bh = get_last_bh(&path); - ih = get_ih(&path); - item = get_item(&path); + ih = tp_item_head(&path); + item = tp_item_body(&path); pos_in_item = path.pos_in_item; /* we've found an unformatted node */ @@ -2281,10 +2454,10 @@ static int map_block_for_writepage(struct inode *inode, goto research; } - memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, + memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied, copy_size); - journal_mark_dirty(&th, inode->i_sb, bh); + journal_mark_dirty(&th, bh); bytes_copied += copy_size; set_block_dev_mapped(bh_result, 0, inode); @@ -2304,10 +2477,10 @@ static int map_block_for_writepage(struct inode *inode, } retval = 0; - out: +out: pathrelse(&path); if (trans_running) { - int err = journal_end(&th, inode->i_sb, jbegin_count); + int err = journal_end(&th); if (err) retval = err; trans_running = 0; @@ -2331,7 +2504,8 @@ static int map_block_for_writepage(struct inode *inode, kunmap(bh_result->b_page); if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* we've copied data from the page into the direct item, so the + /* + * we've copied data from the page into the direct item, so the * buffer in the page is now clean, mark it to reflect that. */ lock_buffer(bh_result); @@ -2370,7 +2544,8 @@ static int reiserfs_write_full_page(struct page *page, return 0; } - /* The page dirty bit is cleared before writepage is called, which + /* + * The page dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers * The page really should be up to date at this point, so tossing * in the BH_Uptodate is just a sanity check. @@ -2381,8 +2556,9 @@ static int reiserfs_write_full_page(struct page *page, } head = page_buffers(page); - /* last page in the file, zero out any contents past the - ** last byte in the file + /* + * last page in the file, zero out any contents past the + * last byte in the file */ if (page->index >= end_index) { unsigned last_offset; @@ -2412,7 +2588,8 @@ static int reiserfs_write_full_page(struct page *page, (!buffer_mapped(bh) || (buffer_mapped(bh) && bh->b_blocknr == 0))) { - /* not mapped yet, or it points to a direct item, search + /* + * not mapped yet, or it points to a direct item, search * the btree for the mapping info, and log any direct * items found */ @@ -2450,10 +2627,11 @@ static int reiserfs_write_full_page(struct page *page, if (checked) { reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); continue; } - /* from this point on, we know the buffer is mapped to a + /* + * from this point on, we know the buffer is mapped to a * real block and not a direct item */ if (wbc->sync_mode != WB_SYNC_NONE) { @@ -2472,7 +2650,7 @@ static int reiserfs_write_full_page(struct page *page, } while ((bh = bh->b_this_page) != head); if (checked) { - error = journal_end(&th, s, bh_per_page + 1); + error = journal_end(&th); reiserfs_write_unlock(s); if (error) goto fail; @@ -2497,7 +2675,7 @@ static int reiserfs_write_full_page(struct page *page, } while (bh != head); error = 0; - done: +done: if (nr == 0) { /* * if this page only had a direct item, it is very possible for @@ -2519,8 +2697,9 @@ static int reiserfs_write_full_page(struct page *page, } return error; - fail: - /* catches various errors, we need to make sure any valid dirty blocks +fail: + /* + * catches various errors, we need to make sure any valid dirty blocks * get to the media. The page is currently locked and not marked for * writeback */ @@ -2533,8 +2712,8 @@ static int reiserfs_write_full_page(struct page *page, mark_buffer_async_write(bh); } else { /* - * clear any dirty bits that might have come from getting - * attached to a dirty page + * clear any dirty bits that might have come from + * getting attached to a dirty page */ clear_buffer_dirty(bh); } @@ -2614,15 +2793,18 @@ static int reiserfs_write_begin(struct file *file, ret = __block_write_begin(page, pos, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; - /* this gets a little ugly. If reiserfs_get_block returned an - * error and left a transacstion running, we've got to close it, - * and we've got to free handle if it was a persistent transaction. + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. * * But, if we had nested into an existing transaction, we need * to just drop the ref count on the handle. * * If old_ref == 0, the transaction is from reiserfs_get_block, - * and it was a persistent trans. Otherwise, it was nested above. + * and it was a persistent trans. Otherwise, it was nested + * above. */ if (th->t_refcount > old_ref) { if (old_ref) @@ -2671,15 +2853,18 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) ret = __block_write_begin(page, from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; - /* this gets a little ugly. If reiserfs_get_block returned an - * error and left a transacstion running, we've got to close it, - * and we've got to free handle if it was a persistent transaction. + /* + * this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close + * it, and we've got to free handle if it was a persistent + * transaction. * * But, if we had nested into an existing transaction, we need * to just drop the ref count on the handle. * * If old_ref == 0, the transaction is from reiserfs_get_block, - * and it was a persistent trans. Otherwise, it was nested above. + * and it was a persistent trans. Otherwise, it was nested + * above. */ if (th->t_refcount > old_ref) { if (old_ref) @@ -2734,17 +2919,20 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, reiserfs_commit_page(inode, page, start, start + copied); - /* generic_commit_write does this for us, but does not update the - ** transaction tracking stuff when the size changes. So, we have - ** to do the i_size updates here. + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. */ if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; reiserfs_write_lock(inode->i_sb); locked = true; - /* If the file have grown beyond the border where it - can have a tail, unmark it as needing a tail - packing */ + /* + * If the file have grown beyond the border where it + * can have a tail, unmark it as needing a tail + * packing + */ if ((have_large_tails(inode->i_sb) && inode->i_size > i_block_size(inode) * 4) || (have_small_tails(inode->i_sb) @@ -2759,13 +2947,13 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, inode->i_size = pos + copied; /* * this will just nest into our transaction. It's important - * to use mark_inode_dirty so the inode gets pushed around on the - * dirty lists, and so that O_SYNC works as expected + * to use mark_inode_dirty so the inode gets pushed around on + * the dirty lists, and so that O_SYNC works as expected */ mark_inode_dirty(inode); reiserfs_update_sd(&myth, inode); update_sd = 1; - ret = journal_end(&myth, inode->i_sb, 1); + ret = journal_end(&myth); if (ret) goto journal_error; } @@ -2781,7 +2969,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, goto out; } - out: +out: if (locked) reiserfs_write_unlock(inode->i_sb); unlock_page(page); @@ -2792,7 +2980,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, return ret == 0 ? copied : ret; - journal_error: +journal_error: reiserfs_write_unlock(inode->i_sb); locked = false; if (th) { @@ -2822,15 +3010,18 @@ int reiserfs_commit_write(struct file *f, struct page *page, } reiserfs_commit_page(inode, page, from, to); - /* generic_commit_write does this for us, but does not update the - ** transaction tracking stuff when the size changes. So, we have - ** to do the i_size updates here. + /* + * generic_commit_write does this for us, but does not update the + * transaction tracking stuff when the size changes. So, we have + * to do the i_size updates here. */ if (pos > inode->i_size) { struct reiserfs_transaction_handle myth; - /* If the file have grown beyond the border where it - can have a tail, unmark it as needing a tail - packing */ + /* + * If the file have grown beyond the border where it + * can have a tail, unmark it as needing a tail + * packing + */ if ((have_large_tails(inode->i_sb) && inode->i_size > i_block_size(inode) * 4) || (have_small_tails(inode->i_sb) @@ -2845,13 +3036,13 @@ int reiserfs_commit_write(struct file *f, struct page *page, inode->i_size = pos; /* * this will just nest into our transaction. It's important - * to use mark_inode_dirty so the inode gets pushed around on the - * dirty lists, and so that O_SYNC works as expected + * to use mark_inode_dirty so the inode gets pushed around + * on the dirty lists, and so that O_SYNC works as expected */ mark_inode_dirty(inode); reiserfs_update_sd(&myth, inode); update_sd = 1; - ret = journal_end(&myth, inode->i_sb, 1); + ret = journal_end(&myth); if (ret) goto journal_error; } @@ -2863,10 +3054,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, goto out; } - out: +out: return ret; - journal_error: +journal_error: if (th) { if (!update_sd) reiserfs_update_sd(th, inode); @@ -2924,9 +3115,10 @@ void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) } } -/* decide if this buffer needs to stay around for data logging or ordered -** write purposes -*/ +/* + * decide if this buffer needs to stay around for data logging or ordered + * write purposes + */ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) { int ret = 1; @@ -2937,7 +3129,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) if (!buffer_mapped(bh)) { goto free_jh; } - /* the page is locked, and the only places that log a data buffer + /* + * the page is locked, and the only places that log a data buffer * also lock the page. */ if (reiserfs_file_data_log(inode)) { @@ -2952,7 +3145,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) struct reiserfs_journal_list *jl; struct reiserfs_jh *jh = bh->b_private; - /* why is this safe? + /* + * why is this safe? * reiserfs_setattr updates i_size in the on disk * stat data before allowing vmtruncate to be called. * @@ -2969,7 +3163,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) ret = 0; } - free_jh: +free_jh: if (ret && bh->b_private) { reiserfs_free_jh(bh); } @@ -3028,7 +3222,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset, ret = try_to_release_page(page, 0); /* maybe should BUG_ON(!ret); - neilb */ } - out: +out: return; } @@ -3080,18 +3274,20 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) return ret; } -/* We thank Mingming Cao for helping us understand in great detail what - to do in this section of the code. */ +/* + * We thank Mingming Cao for helping us understand in great detail what + * to do in this section of the code. + */ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - reiserfs_get_blocks_direct_io); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + reiserfs_get_blocks_direct_io); /* * In case of error extending write may have instantiated a few @@ -3099,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, */ if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); + loff_t end = offset + count; if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { truncate_setsize(inode, isize); @@ -3127,8 +3323,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) dquot_initialize(inode); reiserfs_write_lock(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { - /* version 2 items will be caught by the s_maxbytes check - ** done for us in vmtruncate + /* + * version 2 items will be caught by the s_maxbytes check + * done for us in vmtruncate */ if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && attr->ia_size > MAX_NON_LFS) { @@ -3149,7 +3346,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) err = journal_begin(&th, inode->i_sb, 4); if (!err) { reiserfs_discard_prealloc(&th, inode); - err = journal_end(&th, inode->i_sb, 4); + err = journal_end(&th); } if (err) error = err; @@ -3189,7 +3386,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + /* + * (user+group)*(old+new) structure - we count quota + * info and , inode write (sb, inode) + */ reiserfs_write_lock(inode->i_sb); error = journal_begin(&th, inode->i_sb, jbegin_count); reiserfs_write_unlock(inode->i_sb); @@ -3198,19 +3398,21 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) error = dquot_transfer(inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { - journal_end(&th, inode->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(inode->i_sb); goto out; } - /* Update corresponding info in inode so that everything is in - * one transaction */ + /* + * Update corresponding info in inode so that everything + * is in one transaction + */ if (attr->ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; mark_inode_dirty(inode); - error = journal_end(&th, inode->i_sb, jbegin_count); + error = journal_end(&th); reiserfs_write_unlock(inode->i_sb); if (error) goto out; @@ -3220,8 +3422,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) { error = inode_newsize_ok(inode, attr->ia_size); if (!error) { + /* + * Could race against reiserfs_file_release + * if called from NFS, so take tailpack mutex. + */ + mutex_lock(&REISERFS_I(inode)->tailpack); truncate_setsize(inode, attr->ia_size); - reiserfs_vfs_truncate_file(inode); + reiserfs_truncate_file(inode, 1); + mutex_unlock(&REISERFS_I(inode)->tailpack); } } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 946ccbf5b5a1..501ed6811a2b 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -15,7 +15,8 @@ * reiserfs_ioctl - handler for ioctl for inode * supported commands: * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect - * and prevent packing file (argument arg has to be non-zero) + * and prevent packing file (argument arg has t + * be non-zero) * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION * 3) That's all for a while ... */ @@ -132,7 +133,10 @@ setversion_out: long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - /* These are just misnamed, they actually get/put from/to user an int */ + /* + * These are just misnamed, they actually + * get/put from/to user an int + */ switch (cmd) { case REISERFS_IOC32_UNPACK: cmd = REISERFS_IOC_UNPACK; @@ -160,10 +164,10 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); /* -** reiserfs_unpack -** Function try to convert tail from direct item into indirect. -** It set up nopack attribute in the REISERFS_I(inode)->nopack -*/ + * reiserfs_unpack + * Function try to convert tail from direct item into indirect. + * It set up nopack attribute in the REISERFS_I(inode)->nopack + */ int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; @@ -194,9 +198,10 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) goto out; } - /* we unpack by finding the page with the tail, and calling - ** __reiserfs_write_begin on that page. This will force a - ** reiserfs_get_block to unpack the tail for us. + /* + * we unpack by finding the page with the tail, and calling + * __reiserfs_write_begin on that page. This will force a + * reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT; mapping = inode->i_mapping; @@ -214,11 +219,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) retval = reiserfs_commit_write(NULL, page, write_from, write_from); REISERFS_I(inode)->i_flags |= i_nopack_mask; - out_unlock: +out_unlock: unlock_page(page); page_cache_release(page); - out: +out: mutex_unlock(&inode->i_mutex); reiserfs_write_unlock(inode->i_sb); return retval; diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index ee382ef3d300..cfaee912ee09 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -5,15 +5,17 @@ #include <linux/time.h> #include "reiserfs.h" -// this contains item handlers for old item types: sd, direct, -// indirect, directory +/* + * this contains item handlers for old item types: sd, direct, + * indirect, directory + */ -/* and where are the comments? how about saying where we can find an - explanation of each item handler method? -Hans */ +/* + * and where are the comments? how about saying where we can find an + * explanation of each item handler method? -Hans + */ -////////////////////////////////////////////////////////////////////////////// -// stat data functions -// +/* stat data functions */ static int sd_bytes_number(struct item_head *ih, int block_size) { return 0; @@ -60,7 +62,7 @@ static void sd_print_item(struct item_head *ih, char *item) static void sd_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + /* unused */ } static int sd_create_vi(struct virtual_node *vn, @@ -68,7 +70,6 @@ static int sd_create_vi(struct virtual_node *vn, int is_affected, int insert_size) { vi->vi_index = TYPE_STAT_DATA; - //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? return 0; } @@ -117,15 +118,13 @@ static struct item_operations stat_data_ops = { .print_vi = sd_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// direct item functions -// +/* direct item functions */ static int direct_bytes_number(struct item_head *ih, int block_size) { return ih_item_len(ih); } -// FIXME: this should probably switch to indirect as well +/* FIXME: this should probably switch to indirect as well */ static void direct_decrement_key(struct cpu_key *key) { cpu_key_k_offset_dec(key); @@ -144,7 +143,7 @@ static void direct_print_item(struct item_head *ih, char *item) { int j = 0; -// return; +/* return; */ printk("\""); while (j < ih_item_len(ih)) printk("%c", item[j++]); @@ -153,7 +152,7 @@ static void direct_print_item(struct item_head *ih, char *item) static void direct_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + /* unused */ } static int direct_create_vi(struct virtual_node *vn, @@ -161,7 +160,6 @@ static int direct_create_vi(struct virtual_node *vn, int is_affected, int insert_size) { vi->vi_index = TYPE_DIRECT; - //vi->vi_type |= VI_TYPE_DIRECT; return 0; } @@ -211,16 +209,13 @@ static struct item_operations direct_ops = { .print_vi = direct_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// indirect item functions -// - +/* indirect item functions */ static int indirect_bytes_number(struct item_head *ih, int block_size) { - return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); + return ih_item_len(ih) / UNFM_P_SIZE * block_size; } -// decrease offset, if it becomes 0, change type to stat data +/* decrease offset, if it becomes 0, change type to stat data */ static void indirect_decrement_key(struct cpu_key *key) { cpu_key_k_offset_dec(key); @@ -228,7 +223,7 @@ static void indirect_decrement_key(struct cpu_key *key) set_cpu_key_k_type(key, TYPE_STAT_DATA); } -// if it is not first item of the body, then it is mergeable +/* if it is not first item of the body, then it is mergeable */ static int indirect_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { @@ -236,7 +231,7 @@ static int indirect_is_left_mergeable(struct reiserfs_key *key, return (le_key_k_offset(version, key) != 1); } -// printing of indirect item +/* printing of indirect item */ static void start_new_sequence(__u32 * start, int *len, __u32 new) { *start = new; @@ -295,7 +290,7 @@ static void indirect_print_item(struct item_head *ih, char *item) static void indirect_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + /* unused */ } static int indirect_create_vi(struct virtual_node *vn, @@ -303,7 +298,6 @@ static int indirect_create_vi(struct virtual_node *vn, int is_affected, int insert_size) { vi->vi_index = TYPE_INDIRECT; - //vi->vi_type |= VI_TYPE_INDIRECT; return 0; } @@ -321,16 +315,19 @@ static int indirect_check_right(struct virtual_item *vi, int free) return indirect_check_left(vi, free, 0, 0); } -// return size in bytes of 'units' units. If first == 0 - calculate from the head (left), otherwise - from tail (right) +/* + * return size in bytes of 'units' units. If first == 0 - calculate + * from the head (left), otherwise - from tail (right) + */ static int indirect_part_size(struct virtual_item *vi, int first, int units) { - // unit of indirect item is byte (yet) + /* unit of indirect item is byte (yet) */ return units; } static int indirect_unit_num(struct virtual_item *vi) { - // unit of indirect item is byte (yet) + /* unit of indirect item is byte (yet) */ return vi->vi_item_len - IH_SIZE; } @@ -356,10 +353,7 @@ static struct item_operations indirect_ops = { .print_vi = indirect_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// direntry functions -// - +/* direntry functions */ static int direntry_bytes_number(struct item_head *ih, int block_size) { reiserfs_warning(NULL, "vs-16090", @@ -396,7 +390,7 @@ static void direntry_print_item(struct item_head *ih, char *item) deh = (struct reiserfs_de_head *)item; - for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + for (i = 0; i < ih_entry_count(ih); i++, deh++) { namelen = (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh); @@ -428,9 +422,9 @@ static void direntry_check_item(struct item_head *ih, char *item) int i; struct reiserfs_de_head *deh; - // FIXME: type something here! + /* unused */ deh = (struct reiserfs_de_head *)item; - for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + for (i = 0; i < ih_entry_count(ih); i++, deh++) { ; } } @@ -439,7 +433,8 @@ static void direntry_check_item(struct item_head *ih, char *item) /* * function returns old entry number in directory item in real node - * using new entry number in virtual item in virtual node */ + * using new entry number in virtual item in virtual node + */ static inline int old_entry_num(int is_affected, int virtual_entry_num, int pos_in_item, int mode) { @@ -463,9 +458,11 @@ static inline int old_entry_num(int is_affected, int virtual_entry_num, return virtual_entry_num - 1; } -/* Create an array of sizes of directory entries for virtual - item. Return space used by an item. FIXME: no control over - consuming of space used by this item handler */ +/* + * Create an array of sizes of directory entries for virtual + * item. Return space used by an item. FIXME: no control over + * consuming of space used by this item handler + */ static int direntry_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) @@ -494,8 +491,8 @@ static int direntry_create_vi(struct virtual_node *vn, j = old_entry_num(is_affected, i, vn->vn_pos_in_item, vn->vn_mode); dir_u->entry_sizes[i] = - (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) - - deh_location(&(deh[j])) + DEH_SIZE; + (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) - + deh_location(&deh[j]) + DEH_SIZE; } size += (dir_u->entry_count * sizeof(short)); @@ -529,10 +526,10 @@ static int direntry_create_vi(struct virtual_node *vn, } -// -// return number of entries which may fit into specified amount of -// free space, or -1 if free space is not enough even for 1 entry -// +/* + * return number of entries which may fit into specified amount of + * free space, or -1 if free space is not enough even for 1 entry + */ static int direntry_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { @@ -541,8 +538,8 @@ static int direntry_check_left(struct virtual_item *vi, int free, struct direntry_uarea *dir_u = vi->vi_uarea; for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { + /* i-th entry doesn't fit into the remaining free space */ if (dir_u->entry_sizes[i] > free) - /* i-th entry doesn't fit into the remaining free space */ break; free -= dir_u->entry_sizes[i]; @@ -570,8 +567,8 @@ static int direntry_check_right(struct virtual_item *vi, int free) struct direntry_uarea *dir_u = vi->vi_uarea; for (i = dir_u->entry_count - 1; i >= 0; i--) { + /* i-th entry doesn't fit into the remaining free space */ if (dir_u->entry_sizes[i] > free) - /* i-th entry doesn't fit into the remaining free space */ break; free -= dir_u->entry_sizes[i]; @@ -643,9 +640,7 @@ static struct item_operations direntry_ops = { .print_vi = direntry_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// Error catching functions to catch errors caused by incorrect item types. -// +/* Error catching functions to catch errors caused by incorrect item types. */ static int errcatch_bytes_number(struct item_head *ih, int block_size) { reiserfs_warning(NULL, "green-16001", @@ -685,8 +680,12 @@ static int errcatch_create_vi(struct virtual_node *vn, { reiserfs_warning(NULL, "green-16006", "Invalid item type observed, run fsck ASAP"); - return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where - // this operation is called from is of return type void. + /* + * We might return -1 here as well, but it won't help as + * create_virtual_node() from where this operation is called + * from is of return type void. + */ + return 0; } static int errcatch_check_left(struct virtual_item *vi, int free, @@ -739,9 +738,6 @@ static struct item_operations errcatch_ops = { errcatch_print_vi }; -////////////////////////////////////////////////////////////////////////////// -// -// #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) #error Item types must use disk-format assigned values. #endif diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index fd777032c2ba..e8870de4627e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1,38 +1,38 @@ /* -** Write ahead logging implementation copyright Chris Mason 2000 -** -** The background commits make this code very interrelated, and -** overly complex. I need to rethink things a bit....The major players: -** -** journal_begin -- call with the number of blocks you expect to log. -** If the current transaction is too -** old, it will block until the current transaction is -** finished, and then start a new one. -** Usually, your transaction will get joined in with -** previous ones for speed. -** -** journal_join -- same as journal_begin, but won't block on the current -** transaction regardless of age. Don't ever call -** this. Ever. There are only two places it should be -** called from, and they are both inside this file. -** -** journal_mark_dirty -- adds blocks into this transaction. clears any flags -** that might make them get sent to disk -** and then marks them BH_JDirty. Puts the buffer head -** into the current transaction hash. -** -** journal_end -- if the current transaction is batchable, it does nothing -** otherwise, it could do an async/synchronous commit, or -** a full flush of all log and real blocks in the -** transaction. -** -** flush_old_commits -- if the current transaction is too old, it is ended and -** commit blocks are sent to disk. Forces commit blocks -** to disk for all backgrounded commits that have been -** around too long. -** -- Note, if you call this as an immediate flush from -** from within kupdate, it will ignore the immediate flag -*/ + * Write ahead logging implementation copyright Chris Mason 2000 + * + * The background commits make this code very interrelated, and + * overly complex. I need to rethink things a bit....The major players: + * + * journal_begin -- call with the number of blocks you expect to log. + * If the current transaction is too + * old, it will block until the current transaction is + * finished, and then start a new one. + * Usually, your transaction will get joined in with + * previous ones for speed. + * + * journal_join -- same as journal_begin, but won't block on the current + * transaction regardless of age. Don't ever call + * this. Ever. There are only two places it should be + * called from, and they are both inside this file. + * + * journal_mark_dirty -- adds blocks into this transaction. clears any flags + * that might make them get sent to disk + * and then marks them BH_JDirty. Puts the buffer head + * into the current transaction hash. + * + * journal_end -- if the current transaction is batchable, it does nothing + * otherwise, it could do an async/synchronous commit, or + * a full flush of all log and real blocks in the + * transaction. + * + * flush_old_commits -- if the current transaction is too old, it is ended and + * commit blocks are sent to disk. Forces commit blocks + * to disk for all backgrounded commits that have been + * around too long. + * -- Note, if you call this as an immediate flush from + * from within kupdate, it will ignore the immediate flag + */ #include <linux/time.h> #include <linux/semaphore.h> @@ -58,23 +58,19 @@ #define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ j_working_list)) -/* the number of mounted filesystems. This is used to decide when to -** start and kill the commit workqueue -*/ -static int reiserfs_mounted_fs_count; - -static struct workqueue_struct *commit_wq; - -#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit - structs at 4k */ +/* must be correct to keep the desc and commit structs at 4k */ +#define JOURNAL_TRANS_HALF 1018 #define BUFNR 64 /*read ahead */ /* cnode stat bits. Move these into reiserfs_fs.h */ -#define BLOCK_FREED 2 /* this block was freed, and can't be written. */ -#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ +/* this block was freed, and can't be written. */ +#define BLOCK_FREED 2 +/* this block was freed during this transaction, and can't be written */ +#define BLOCK_FREED_HOLDER 3 -#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +/* used in flush_journal_list */ +#define BLOCK_NEEDS_FLUSH 4 #define BLOCK_DIRTIED 5 /* journal list state bits */ @@ -87,16 +83,14 @@ static struct workqueue_struct *commit_wq; #define COMMIT_NOW 2 /* end and commit this transaction */ #define WAIT 4 /* wait for the log blocks to hit the disk */ -static int do_journal_end(struct reiserfs_transaction_handle *, - struct super_block *, unsigned long nblocks, - int flags); +static int do_journal_end(struct reiserfs_transaction_handle *, int flags); static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall); static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall); static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks); + struct super_block *sb); static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal); static int dirty_one_transaction(struct super_block *s, @@ -107,8 +101,10 @@ static void queue_log_writer(struct super_block *s); /* values for join in do_journal_begin_r */ enum { JBEGIN_REG = 0, /* regular journal begin */ - JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ - JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ + /* join the running transaction if at all possible */ + JBEGIN_JOIN = 1, + /* called from cleanup code, ignores aborted flag */ + JBEGIN_ABORT = 2, }; static int do_journal_begin_r(struct reiserfs_transaction_handle *th, @@ -123,10 +119,11 @@ static void init_journal_hash(struct super_block *sb) } /* -** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to -** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for -** more details. -*/ + * clears BH_Dirty and sticks the buffer on the clean list. Called because + * I can't allow refile_buffer to make schedule happen after I've freed a + * block. Look at remove_from_transaction and journal_mark_freed for + * more details. + */ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { if (bh) { @@ -163,7 +160,7 @@ static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb) struct list_head *entry = journal->j_bitmap_nodes.next; journal->j_used_bitmap_nodes++; - repeat: +repeat: if (entry != &journal->j_bitmap_nodes) { bn = list_entry(entry, struct reiserfs_bitmap_node, list); @@ -204,7 +201,8 @@ static void allocate_bitmap_nodes(struct super_block *sb) list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; } else { - break; /* this is ok, we'll try again when more are needed */ + /* this is ok, we'll try again when more are needed */ + break; } } } @@ -239,8 +237,8 @@ static void cleanup_bitmap_list(struct super_block *sb, } /* -** only call this on FS unmount. -*/ + * only call this on FS unmount. + */ static int free_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array) { @@ -275,9 +273,9 @@ static int free_bitmap_nodes(struct super_block *sb) } /* -** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. -** jb_array is the array to be filled in. -*/ + * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. + * jb_array is the array to be filled in. + */ int reiserfs_allocate_list_bitmaps(struct super_block *sb, struct reiserfs_list_bitmap *jb_array, unsigned int bmap_nr) @@ -306,9 +304,9 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb, } /* -** find an available list bitmap. If you can't find one, flush a commit list -** and try again -*/ + * find an available list bitmap. If you can't find one, flush a commit list + * and try again + */ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, struct reiserfs_journal_list *jl) @@ -332,18 +330,18 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb, break; } } - if (jb->journal_list) { /* double check to make sure if flushed correctly */ + /* double check to make sure if flushed correctly */ + if (jb->journal_list) return NULL; - } jb->journal_list = jl; return jb; } /* -** allocates a new chunk of X nodes, and links them all together as a list. -** Uses the cnode->next and cnode->prev pointers -** returns NULL on failure -*/ + * allocates a new chunk of X nodes, and links them all together as a list. + * Uses the cnode->next and cnode->prev pointers + * returns NULL on failure + */ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { struct reiserfs_journal_cnode *head; @@ -365,9 +363,7 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) return head; } -/* -** pulls a cnode off the free list, or returns NULL on failure -*/ +/* pulls a cnode off the free list, or returns NULL on failure */ static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) { struct reiserfs_journal_cnode *cn; @@ -393,8 +389,8 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb) } /* -** returns a cnode to the free list -*/ + * returns a cnode to the free list + */ static void free_cnode(struct super_block *sb, struct reiserfs_journal_cnode *cn) { @@ -419,7 +415,10 @@ static void clear_prepared_bits(struct buffer_head *bh) clear_buffer_journal_restore_dirty(bh); } -/* return a cnode with same dev, block number and size in table, or null if not found */ +/* + * return a cnode with same dev, block number and size in table, + * or null if not found + */ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct super_block *sb, @@ -439,23 +438,24 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct } /* -** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated -** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever -** being overwritten by a replay after crashing. -** -** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting -** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make -** sure you never write the block without logging it. -** -** next_zero_bit is a suggestion about the next block to try for find_forward. -** when bl is rejected because it is set in a journal list bitmap, we search -** for the next zero bit in the bitmap that rejected bl. Then, we return that -** through next_zero_bit for find_forward to try. -** -** Just because we return something in next_zero_bit does not mean we won't -** reject it on the next call to reiserfs_in_journal -** -*/ + * this actually means 'can this block be reallocated yet?'. If you set + * search_all, a block can only be allocated if it is not in the current + * transaction, was not freed by the current transaction, and has no chance + * of ever being overwritten by a replay after crashing. + * + * If you don't set search_all, a block can only be allocated if it is not + * in the current transaction. Since deleting a block removes it from the + * current transaction, this case should never happen. If you don't set + * search_all, make sure you never write the block without logging it. + * + * next_zero_bit is a suggestion about the next block to try for find_forward. + * when bl is rejected because it is set in a journal list bitmap, we search + * for the next zero bit in the bitmap that rejected bl. Then, we return + * that through next_zero_bit for find_forward to try. + * + * Just because we return something in next_zero_bit does not mean we won't + * reject it on the next call to reiserfs_in_journal + */ int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, int bit_nr, int search_all, b_blocknr_t * next_zero_bit) @@ -469,9 +469,11 @@ int reiserfs_in_journal(struct super_block *sb, *next_zero_bit = 0; /* always start this at zero. */ PROC_INFO_INC(sb, journal.in_journal); - /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. - ** if we crash before the transaction that freed it commits, this transaction won't - ** have committed either, and the block will never be written + /* + * If we aren't doing a search_all, this is a metablock, and it + * will be logged before use. if we crash before the transaction + * that freed it commits, this transaction won't have committed + * either, and the block will never be written */ if (search_all) { for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { @@ -511,8 +513,7 @@ int reiserfs_in_journal(struct super_block *sb, return 0; } -/* insert cn into table -*/ +/* insert cn into table */ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { @@ -558,10 +559,10 @@ static inline void put_journal_list(struct super_block *s, } /* -** this used to be much more involved, and I'm keeping it just in case things get ugly again. -** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a -** transaction. -*/ + * this used to be much more involved, and I'm keeping it just in case + * things get ugly again. it gets called by flush_commit_list, and + * cleans up any data stored about blocks freed during a transaction. + */ static void cleanup_freed_for_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl) { @@ -756,11 +757,12 @@ static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, jh = bh->b_private; list_del_init(&jh->list); } else { - no_jh: +no_jh: get_bh(bh); jh = alloc_jh(); spin_lock(&j->j_dirty_buffers_lock); - /* buffer must be locked for __add_jh, should be able to have + /* + * buffer must be locked for __add_jh, should be able to have * two adds at the same time */ BUG_ON(bh->b_private); @@ -818,7 +820,8 @@ static int write_ordered_buffers(spinlock_t * lock, spin_lock(lock); goto loop_next; } - /* in theory, dirty non-uptodate buffers should never get here, + /* + * in theory, dirty non-uptodate buffers should never get here, * but the upper layer io error paths still have a few quirks. * Handle them here as gracefully as we can */ @@ -833,7 +836,7 @@ static int write_ordered_buffers(spinlock_t * lock, reiserfs_free_jh(bh); unlock_buffer(bh); } - loop_next: +loop_next: put_bh(bh); cond_resched_lock(lock); } @@ -856,13 +859,14 @@ static int write_ordered_buffers(spinlock_t * lock, if (!buffer_uptodate(bh)) { ret = -EIO; } - /* ugly interaction with invalidatepage here. - * reiserfs_invalidate_page will pin any buffer that has a valid - * journal head from an older transaction. If someone else sets - * our buffer dirty after we write it in the first loop, and - * then someone truncates the page away, nobody will ever write - * the buffer. We're safe if we write the page one last time - * after freeing the journal header. + /* + * ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a + * valid journal head from an older transaction. If someone + * else sets our buffer dirty after we write it in the first + * loop, and then someone truncates the page away, nobody + * will ever write the buffer. We're safe if we write the + * page one last time after freeing the journal header. */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); @@ -887,7 +891,7 @@ static int flush_older_commits(struct super_block *s, unsigned int other_trans_id; unsigned int first_trans_id; - find_first: +find_first: /* * first we walk backwards to find the oldest uncommitted transation */ @@ -923,9 +927,11 @@ static int flush_older_commits(struct super_block *s, if (!journal_list_still_alive(s, trans_id)) return 1; - /* the one we just flushed is gone, this means all - * older lists are also gone, so first_jl is no longer - * valid either. Go back to the beginning. + /* + * the one we just flushed is gone, this means + * all older lists are also gone, so first_jl + * is no longer valid either. Go back to the + * beginning. */ if (!journal_list_still_alive (s, other_trans_id)) { @@ -958,12 +964,12 @@ static int reiserfs_async_progress_wait(struct super_block *s) } /* -** if this journal list still has commit blocks unflushed, send them to disk. -** -** log areas must be flushed in order (transaction 2 can't commit before transaction 1) -** Before the commit block can by written, every other log block must be safely on disk -** -*/ + * if this journal list still has commit blocks unflushed, send them to disk. + * + * log areas must be flushed in order (transaction 2 can't commit before + * transaction 1) Before the commit block can by written, every other log + * block must be safely on disk + */ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { @@ -982,8 +988,9 @@ static int flush_commit_list(struct super_block *s, return 0; } - /* before we can put our commit blocks on disk, we have to make sure everyone older than - ** us is on disk too + /* + * before we can put our commit blocks on disk, we have to make + * sure everyone older than us is on disk too */ BUG_ON(jl->j_len <= 0); BUG_ON(trans_id == journal->j_trans_id); @@ -991,7 +998,10 @@ static int flush_commit_list(struct super_block *s, get_journal_list(jl); if (flushall) { if (flush_older_commits(s, jl) == 1) { - /* list disappeared during flush_older_commits. return */ + /* + * list disappeared during flush_older_commits. + * return + */ goto put_jl; } } @@ -1006,9 +1016,9 @@ static int flush_commit_list(struct super_block *s, BUG_ON(jl->j_trans_id == 0); /* this commit is done, exit */ - if (atomic_read(&(jl->j_commit_left)) <= 0) { + if (atomic_read(&jl->j_commit_left) <= 0) { if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1); + atomic_set(&jl->j_older_commits_done, 1); } mutex_unlock(&jl->j_commit_mutex); goto put_jl; @@ -1063,9 +1073,10 @@ static int flush_commit_list(struct super_block *s, depth = reiserfs_write_unlock_nested(s); __wait_on_buffer(tbh); reiserfs_write_lock_nested(s, depth); - // since we're using ll_rw_blk above, it might have skipped over - // a locked buffer. Double check here - // + /* + * since we're using ll_rw_blk above, it might have skipped + * over a locked buffer. Double check here + */ /* redundant, sync_dirty_buffer() checks */ if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); @@ -1079,17 +1090,21 @@ static int flush_commit_list(struct super_block *s, #endif retval = -EIO; } - put_bh(tbh); /* once for journal_find_get_block */ - put_bh(tbh); /* once due to original getblk in do_journal_end */ - atomic_dec(&(jl->j_commit_left)); + /* once for journal_find_get_block */ + put_bh(tbh); + /* once due to original getblk in do_journal_end */ + put_bh(tbh); + atomic_dec(&jl->j_commit_left); } - BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); + BUG_ON(atomic_read(&jl->j_commit_left) != 1); - /* If there was a write error in the journal - we can't commit + /* + * If there was a write error in the journal - we can't commit * this transaction - it will be invalid and, if successful, * will just end up propagating the write error out to - * the file system. */ + * the file system. + */ if (likely(!retval && !reiserfs_is_journal_aborted (journal))) { if (buffer_dirty(jl->j_commit_bh)) BUG(); @@ -1102,9 +1117,11 @@ static int flush_commit_list(struct super_block *s, reiserfs_write_lock_nested(s, depth); } - /* If there was a write error in the journal - we can't commit this + /* + * If there was a write error in the journal - we can't commit this * transaction - it will be invalid and, if successful, will just end - * up propagating the write error out to the filesystem. */ + * up propagating the write error out to the filesystem. + */ if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(s, "journal-615", "buffer write failed"); @@ -1119,7 +1136,10 @@ static int flush_commit_list(struct super_block *s, } journal->j_last_commit_id = jl->j_trans_id; - /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ + /* + * now, every commit block is on the disk. It is safe to allow + * blocks freed during this transaction to be reallocated + */ cleanup_freed_for_journal_list(s, jl); retval = retval ? retval : journal->j_errno; @@ -1127,13 +1147,13 @@ static int flush_commit_list(struct super_block *s, /* mark the metadata dirty */ if (!retval) dirty_one_transaction(s, jl); - atomic_dec(&(jl->j_commit_left)); + atomic_dec(&jl->j_commit_left); if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1); + atomic_set(&jl->j_older_commits_done, 1); } mutex_unlock(&jl->j_commit_mutex); - put_jl: +put_jl: put_journal_list(s, jl); if (retval) @@ -1143,9 +1163,9 @@ static int flush_commit_list(struct super_block *s, } /* -** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or -** returns NULL if it can't find anything -*/ + * flush_journal_list frequently needs to find a newer transaction for a + * given block. This does that, or returns NULL if it can't find anything + */ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) @@ -1169,10 +1189,11 @@ static void remove_journal_hash(struct super_block *, int); /* -** once all the real blocks have been flushed, it is safe to remove them from the -** journal list for this transaction. Aside from freeing the cnode, this also allows the -** block to be reallocated for data blocks if it had been deleted. -*/ + * once all the real blocks have been flushed, it is safe to remove them + * from the journal list for this transaction. Aside from freeing the + * cnode, this also allows the block to be reallocated for data blocks + * if it had been deleted. + */ static void remove_all_from_journal_list(struct super_block *sb, struct reiserfs_journal_list *jl, int debug) @@ -1181,8 +1202,9 @@ static void remove_all_from_journal_list(struct super_block *sb, struct reiserfs_journal_cnode *cn, *last; cn = jl->j_realblock; - /* which is better, to lock once around the whole loop, or - ** to lock for each call to remove_journal_hash? + /* + * which is better, to lock once around the whole loop, or + * to lock for each call to remove_journal_hash? */ while (cn) { if (cn->blocknr != 0) { @@ -1204,12 +1226,13 @@ static void remove_all_from_journal_list(struct super_block *sb, } /* -** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block. -** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start -** releasing blocks in this transaction for reuse as data blocks. -** called by flush_journal_list, before it calls remove_all_from_journal_list -** -*/ + * if this timestamp is greater than the timestamp we wrote last to the + * header block, write it to the header block. once this is done, I can + * safely say the log area for this transaction won't ever be replayed, + * and I can start releasing blocks in this transaction for reuse as data + * blocks. called by flush_journal_list, before it calls + * remove_all_from_journal_list + */ static int _update_journal_header_block(struct super_block *sb, unsigned long offset, unsigned int trans_id) @@ -1279,10 +1302,11 @@ static int flush_older_journal_lists(struct super_block *sb, struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned int trans_id = jl->j_trans_id; - /* we know we are the only ones flushing things, no extra race + /* + * we know we are the only ones flushing things, no extra race * protection is required. */ - restart: +restart: entry = journal->j_journal_list.next; /* Did we wrap? */ if (entry == &journal->j_journal_list) @@ -1309,15 +1333,16 @@ static void del_from_work_list(struct super_block *s, } } -/* flush a journal list, both commit and real blocks -** -** always set flushall to 1, unless you are calling from inside -** flush_journal_list -** -** IMPORTANT. This can only be called while there are no journal writers, -** and the journal is locked. That means it can only be called from -** do_journal_end, or by journal_release -*/ +/* + * flush a journal list, both commit and real blocks + * + * always set flushall to 1, unless you are calling from inside + * flush_journal_list + * + * IMPORTANT. This can only be called while there are no journal writers, + * and the journal is locked. That means it can only be called from + * do_journal_end, or by journal_release + */ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { @@ -1354,13 +1379,14 @@ static int flush_journal_list(struct super_block *s, } /* if all the work is already done, get out of here */ - if (atomic_read(&(jl->j_nonzerolen)) <= 0 && - atomic_read(&(jl->j_commit_left)) <= 0) { + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { goto flush_older_and_return; } - /* start by putting the commit list on disk. This will also flush - ** the commit lists of any olders transactions + /* + * start by putting the commit list on disk. This will also flush + * the commit lists of any olders transactions */ flush_commit_list(s, jl, 1); @@ -1369,15 +1395,16 @@ static int flush_journal_list(struct super_block *s, BUG(); /* are we done now? */ - if (atomic_read(&(jl->j_nonzerolen)) <= 0 && - atomic_read(&(jl->j_commit_left)) <= 0) { + if (atomic_read(&jl->j_nonzerolen) <= 0 && + atomic_read(&jl->j_commit_left) <= 0) { goto flush_older_and_return; } - /* loop through each cnode, see if we need to write it, - ** or wait on a more recent transaction, or just ignore it + /* + * loop through each cnode, see if we need to write it, + * or wait on a more recent transaction, or just ignore it */ - if (atomic_read(&(journal->j_wcount)) != 0) { + if (atomic_read(&journal->j_wcount) != 0) { reiserfs_panic(s, "journal-844", "journal list is flushing, " "wcount is not 0"); } @@ -1391,20 +1418,25 @@ static int flush_journal_list(struct super_block *s, goto free_cnode; } - /* This transaction failed commit. Don't write out to the disk */ + /* + * This transaction failed commit. + * Don't write out to the disk + */ if (!(jl->j_state & LIST_DIRTY)) goto free_cnode; pjl = find_newer_jl_for_cn(cn); - /* the order is important here. We check pjl to make sure we - ** don't clear BH_JDirty_wait if we aren't the one writing this - ** block to disk + /* + * the order is important here. We check pjl to make sure we + * don't clear BH_JDirty_wait if we aren't the one writing this + * block to disk */ if (!pjl && cn->bh) { saved_bh = cn->bh; - /* we do this to make sure nobody releases the buffer while - ** we are working with it + /* + * we do this to make sure nobody releases the + * buffer while we are working with it */ get_bh(saved_bh); @@ -1413,13 +1445,17 @@ static int flush_journal_list(struct super_block *s, was_jwait = 1; was_dirty = 1; } else if (can_dirty(cn)) { - /* everything with !pjl && jwait should be writable */ + /* + * everything with !pjl && jwait + * should be writable + */ BUG(); } } - /* if someone has this block in a newer transaction, just make - ** sure they are committed, and don't try writing it to disk + /* + * if someone has this block in a newer transaction, just make + * sure they are committed, and don't try writing it to disk */ if (pjl) { if (atomic_read(&pjl->j_commit_left)) @@ -1427,16 +1463,18 @@ static int flush_journal_list(struct super_block *s, goto free_cnode; } - /* bh == NULL when the block got to disk on its own, OR, - ** the block got freed in a future transaction + /* + * bh == NULL when the block got to disk on its own, OR, + * the block got freed in a future transaction */ if (saved_bh == NULL) { goto free_cnode; } - /* this should never happen. kupdate_one_transaction has this list - ** locked while it works, so we should never see a buffer here that - ** is not marked JDirty_wait + /* + * this should never happen. kupdate_one_transaction has + * this list locked while it works, so we should never see a + * buffer here that is not marked JDirty_wait */ if ((!was_jwait) && !buffer_locked(saved_bh)) { reiserfs_warning(s, "journal-813", @@ -1447,7 +1485,10 @@ static int flush_journal_list(struct super_block *s, was_jwait ? ' ' : '!'); } if (was_dirty) { - /* we inc again because saved_bh gets decremented at free_cnode */ + /* + * we inc again because saved_bh gets decremented + * at free_cnode + */ get_bh(saved_bh); set_bit(BLOCK_NEEDS_FLUSH, &cn->state); lock_buffer(saved_bh); @@ -1463,13 +1504,16 @@ static int flush_journal_list(struct super_block *s, (unsigned long long)saved_bh-> b_blocknr, __func__); } - free_cnode: +free_cnode: last = cn; cn = cn->next; if (saved_bh) { - /* we incremented this to keep others from taking the buffer head away */ + /* + * we incremented this to keep others from + * taking the buffer head away + */ put_bh(saved_bh); - if (atomic_read(&(saved_bh->b_count)) < 0) { + if (atomic_read(&saved_bh->b_count) < 0) { reiserfs_warning(s, "journal-945", "saved_bh->b_count < 0"); } @@ -1499,8 +1543,10 @@ static int flush_journal_list(struct super_block *s, #endif err = -EIO; } - /* note, we must clear the JDirty_wait bit after the up to date - ** check, otherwise we race against our flushpage routine + /* + * note, we must clear the JDirty_wait bit + * after the up to date check, otherwise we + * race against our flushpage routine */ BUG_ON(!test_clear_buffer_journal_dirty (cn->bh)); @@ -1518,25 +1564,27 @@ static int flush_journal_list(struct super_block *s, reiserfs_abort(s, -EIO, "Write error while pushing transaction to disk in %s", __func__); - flush_older_and_return: +flush_older_and_return: - /* before we can update the journal header block, we _must_ flush all - ** real blocks from all older transactions to disk. This is because - ** once the header block is updated, this transaction will not be - ** replayed after a crash + /* + * before we can update the journal header block, we _must_ flush all + * real blocks from all older transactions to disk. This is because + * once the header block is updated, this transaction will not be + * replayed after a crash */ if (flushall) { flush_older_journal_lists(s, jl); } err = journal->j_errno; - /* before we can remove everything from the hash tables for this - ** transaction, we must make sure it can never be replayed - ** - ** since we are only called from do_journal_end, we know for sure there - ** are no allocations going on while we are flushing journal lists. So, - ** we only need to update the journal header block for the last list - ** being flushed + /* + * before we can remove everything from the hash tables for this + * transaction, we must make sure it can never be replayed + * + * since we are only called from do_journal_end, we know for sure there + * are no allocations going on while we are flushing journal lists. So, + * we only need to update the journal header block for the last list + * being flushed */ if (!err && flushall) { err = @@ -1561,11 +1609,12 @@ static int flush_journal_list(struct super_block *s, } journal->j_last_flush_id = jl->j_trans_id; - /* not strictly required since we are freeing the list, but it should + /* + * not strictly required since we are freeing the list, but it should * help find code using dead lists later on */ jl->j_len = 0; - atomic_set(&(jl->j_nonzerolen), 0); + atomic_set(&jl->j_nonzerolen, 0); jl->j_start = 0; jl->j_realblock = NULL; jl->j_commit_bh = NULL; @@ -1592,15 +1641,17 @@ static int write_one_transaction(struct super_block *s, cn = jl->j_realblock; while (cn) { - /* if the blocknr == 0, this has been cleared from the hash, - ** skip it + /* + * if the blocknr == 0, this has been cleared from the hash, + * skip it */ if (cn->blocknr == 0) { goto next; } if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { struct buffer_head *tmp_bh; - /* we can race against journal_mark_freed when we try + /* + * we can race against journal_mark_freed when we try * to lock_buffer(cn->bh), so we have to inc the buffer * count, and recheck things after locking */ @@ -1619,7 +1670,7 @@ static int write_one_transaction(struct super_block *s, } put_bh(tmp_bh); } - next: +next: cn = cn->next; cond_resched(); } @@ -1637,15 +1688,17 @@ static int dirty_one_transaction(struct super_block *s, jl->j_state |= LIST_DIRTY; cn = jl->j_realblock; while (cn) { - /* look for a more recent transaction that logged this - ** buffer. Only the most recent transaction with a buffer in - ** it is allowed to send that buffer to disk + /* + * look for a more recent transaction that logged this + * buffer. Only the most recent transaction with a buffer in + * it is allowed to send that buffer to disk */ pjl = find_newer_jl_for_cn(cn); if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) { BUG_ON(!can_dirty(cn)); - /* if the buffer is prepared, it will either be logged + /* + * if the buffer is prepared, it will either be logged * or restored. If restored, we need to make sure * it actually gets marked dirty */ @@ -1682,7 +1735,8 @@ static int kupdate_transactions(struct super_block *s, goto done; } - /* we've got j_flush_mutex held, nobody is going to delete any + /* + * we've got j_flush_mutex held, nobody is going to delete any * of these lists out from underneath us */ while ((num_trans && transactions_flushed < num_trans) || @@ -1716,20 +1770,21 @@ static int kupdate_transactions(struct super_block *s, write_chunk(&chunk); } - done: +done: mutex_unlock(&journal->j_flush_mutex); return ret; } -/* for o_sync and fsync heavy applications, they tend to use -** all the journa list slots with tiny transactions. These -** trigger lots and lots of calls to update the header block, which -** adds seeks and slows things down. -** -** This function tries to clear out a large chunk of the journal lists -** at once, which makes everything faster since only the newest journal -** list updates the header block -*/ +/* + * for o_sync and fsync heavy applications, they tend to use + * all the journa list slots with tiny transactions. These + * trigger lots and lots of calls to update the header block, which + * adds seeks and slows things down. + * + * This function tries to clear out a large chunk of the journal lists + * at once, which makes everything faster since only the newest journal + * list updates the header block + */ static int flush_used_journal_lists(struct super_block *s, struct reiserfs_journal_list *jl) { @@ -1766,9 +1821,11 @@ static int flush_used_journal_lists(struct super_block *s, } get_journal_list(jl); get_journal_list(flush_jl); - /* try to find a group of blocks we can flush across all the - ** transactions, but only bother if we've actually spanned - ** across multiple lists + + /* + * try to find a group of blocks we can flush across all the + * transactions, but only bother if we've actually spanned + * across multiple lists */ if (flush_jl != jl) { ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); @@ -1780,9 +1837,9 @@ static int flush_used_journal_lists(struct super_block *s, } /* -** removes any nodes in table with name block and dev as bh. -** only touchs the hnext and hprev pointers. -*/ + * removes any nodes in table with name block and dev as bh. + * only touchs the hnext and hprev pointers. + */ void remove_journal_hash(struct super_block *sb, struct reiserfs_journal_cnode **table, struct reiserfs_journal_list *jl, @@ -1811,8 +1868,12 @@ void remove_journal_hash(struct super_block *sb, cur->blocknr = 0; cur->sb = NULL; cur->state = 0; - if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ - atomic_dec(&(cur->jlist->j_nonzerolen)); + /* + * anybody who clears the cur->bh will also + * dec the nonzerolen + */ + if (cur->bh && cur->jlist) + atomic_dec(&cur->jlist->j_nonzerolen); cur->bh = NULL; cur->jlist = NULL; } @@ -1832,17 +1893,18 @@ static void free_journal_ram(struct super_block *sb) if (journal->j_header_bh) { brelse(journal->j_header_bh); } - /* j_header_bh is on the journal dev, make sure not to release the journal - * dev until we brelse j_header_bh + /* + * j_header_bh is on the journal dev, make sure + * not to release the journal dev until we brelse j_header_bh */ release_journal_dev(sb, journal); vfree(journal); } /* -** call on unmount. Only set error to 1 if you haven't made your way out -** of read_super() yet. Any other caller must keep error at 0. -*/ + * call on unmount. Only set error to 1 if you haven't made your way out + * of read_super() yet. Any other caller must keep error at 0. + */ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { @@ -1850,21 +1912,25 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, int flushed = 0; struct reiserfs_journal *journal = SB_JOURNAL(sb); - /* we only want to flush out transactions if we were called with error == 0 + /* + * we only want to flush out transactions if we were + * called with error == 0 */ if (!error && !(sb->s_flags & MS_RDONLY)) { /* end the current trans */ BUG_ON(!th->t_trans_id); - do_journal_end(th, sb, 10, FLUSH_ALL); + do_journal_end(th, FLUSH_ALL); - /* make sure something gets logged to force our way into the flush code */ - if (!journal_join(&myth, sb, 1)) { + /* + * make sure something gets logged to force + * our way into the flush code + */ + if (!journal_join(&myth, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&myth, sb, - SB_BUFFER_WITH_SB(sb)); - do_journal_end(&myth, sb, 1, FLUSH_ALL); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); flushed = 1; } } @@ -1872,17 +1938,15 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, /* this also catches errors during the do_journal_end above */ if (!error && reiserfs_is_journal_aborted(journal)) { memset(&myth, 0, sizeof(myth)); - if (!journal_join_abort(&myth, sb, 1)) { + if (!journal_join_abort(&myth, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&myth, sb, - SB_BUFFER_WITH_SB(sb)); - do_journal_end(&myth, sb, 1, FLUSH_ALL); + journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); + do_journal_end(&myth, FLUSH_ALL); } } - reiserfs_mounted_fs_count--; /* wait for all commits to finish */ cancel_delayed_work(&SB_JOURNAL(sb)->j_work); @@ -1893,12 +1957,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, reiserfs_write_unlock(sb); cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); - flush_workqueue(commit_wq); - - if (!reiserfs_mounted_fs_count) { - destroy_workqueue(commit_wq); - commit_wq = NULL; - } + flush_workqueue(REISERFS_SB(sb)->commit_wq); free_journal_ram(sb); @@ -1907,25 +1966,24 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, return 0; } -/* -** call on unmount. flush all journal trans, release all alloc'd ram -*/ +/* * call on unmount. flush all journal trans, release all alloc'd ram */ int journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb) { return do_journal_release(th, sb, 0); } -/* -** only call from an error condition inside reiserfs_read_super! -*/ +/* only call from an error condition inside reiserfs_read_super! */ int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *sb) { return do_journal_release(th, sb, 1); } -/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ +/* + * compares description block with commit block. + * returns 1 if they differ, 0 if they are the same + */ static int journal_compare_desc_commit(struct super_block *sb, struct reiserfs_journal_desc *desc, struct reiserfs_journal_commit *commit) @@ -1939,11 +1997,12 @@ static int journal_compare_desc_commit(struct super_block *sb, return 0; } -/* returns 0 if it did not find a description block -** returns -1 if it found a corrupt commit block -** returns 1 if both desc and commit were valid -** NOTE: only called during fs mount -*/ +/* + * returns 0 if it did not find a description block + * returns -1 if it found a corrupt commit block + * returns 1 if both desc and commit were valid + * NOTE: only called during fs mount + */ static int journal_transaction_is_valid(struct super_block *sb, struct buffer_head *d_bh, unsigned int *oldest_invalid_trans_id, @@ -1989,7 +2048,10 @@ static int journal_transaction_is_valid(struct super_block *sb, } offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb); - /* ok, we have a journal description block, lets see if the transaction was valid */ + /* + * ok, we have a journal description block, + * let's see if the transaction was valid + */ c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + @@ -2041,11 +2103,11 @@ static void brelse_array(struct buffer_head **heads, int num) } /* -** given the start, and values for the oldest acceptable transactions, -** this either reads in a replays a transaction, or returns because the -** transaction is invalid, or too old. -** NOTE: only called during fs mount -*/ + * given the start, and values for the oldest acceptable transactions, + * this either reads in a replays a transaction, or returns because the + * transaction is invalid, or too old. + * NOTE: only called during fs mount + */ static int journal_read_transaction(struct super_block *sb, unsigned long cur_dblock, unsigned long oldest_start, @@ -2119,7 +2181,10 @@ static int journal_read_transaction(struct super_block *sb, } trans_id = get_desc_trans_id(desc); - /* now we know we've got a good transaction, and it was inside the valid time ranges */ + /* + * now we know we've got a good transaction, and it was + * inside the valid time ranges + */ log_blocks = kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS); real_blocks = kmalloc(get_desc_trans_len(desc) * @@ -2164,7 +2229,7 @@ static int journal_read_transaction(struct super_block *sb, reiserfs_warning(sb, "journal-1204", "REPLAY FAILURE fsck required! " "Trying to replay onto a log block"); - abort_replay: +abort_replay: brelse_array(log_blocks, i); brelse_array(real_blocks, i); brelse(c_bh); @@ -2226,7 +2291,10 @@ static int journal_read_transaction(struct super_block *sb, "journal-1095: setting journal " "start to offset %ld", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb)); - /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ + /* + * init starting values for the first transaction, in case + * this is the last transaction to be replayed. + */ journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb); journal->j_last_flush_trans_id = trans_id; journal->j_trans_id = trans_id + 1; @@ -2240,12 +2308,14 @@ static int journal_read_transaction(struct super_block *sb, return 0; } -/* This function reads blocks starting from block and to max_block of bufsize - size (but no more than BUFNR blocks at a time). This proved to improve - mounting speed on self-rebuilding raid5 arrays at least. - Right now it is only used from journal code. But later we might use it - from other places. - Note: Do not use journal_getblk/sb_getblk functions here! */ +/* + * This function reads blocks starting from block and to max_block of bufsize + * size (but no more than BUFNR blocks at a time). This proved to improve + * mounting speed on self-rebuilding raid5 arrays at least. + * Right now it is only used from journal code. But later we might use it + * from other places. + * Note: Do not use journal_getblk/sb_getblk functions here! + */ static struct buffer_head *reiserfs_breada(struct block_device *dev, b_blocknr_t block, int bufsize, b_blocknr_t max_block) @@ -2284,15 +2354,17 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } /* -** read and replay the log -** on a clean unmount, the journal header's next unflushed pointer will -** be to an invalid transaction. This tests that before finding all the -** transactions in the log, which makes normal mount times fast. -** After a crash, this starts with the next unflushed transaction, and -** replays until it finds one too old, or invalid. -** On exit, it sets things up so the first transaction will work correctly. -** NOTE: only called during fs mount -*/ + * read and replay the log + * on a clean unmount, the journal header's next unflushed pointer will be + * to an invalid transaction. This tests that before finding all the + * transactions in the log, which makes normal mount times fast. + * + * After a crash, this starts with the next unflushed transaction, and + * replays until it finds one too old, or invalid. + * + * On exit, it sets things up so the first transaction will work correctly. + * NOTE: only called during fs mount + */ static int journal_read(struct super_block *sb) { struct reiserfs_journal *journal = SB_JOURNAL(sb); @@ -2316,9 +2388,10 @@ static int journal_read(struct super_block *sb) bdevname(journal->j_dev_bd, b)); start = get_seconds(); - /* step 1, read in the journal header block. Check the transaction it says - ** is the first unflushed, and if that transaction is not valid, - ** replay is done + /* + * step 1, read in the journal header block. Check the transaction + * it says is the first unflushed, and if that transaction is not + * valid, replay is done */ journal->j_header_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) @@ -2342,9 +2415,10 @@ static int journal_read(struct super_block *sb) le32_to_cpu(jh->j_last_flush_trans_id)); valid_journal_header = 1; - /* now, we try to read the first unflushed offset. If it is not valid, - ** there is nothing more we can do, and it makes no sense to read - ** through the whole log. + /* + * now, we try to read the first unflushed offset. If it + * is not valid, there is nothing more we can do, and it + * makes no sense to read through the whole log. */ d_bh = journal_bread(sb, @@ -2358,15 +2432,19 @@ static int journal_read(struct super_block *sb) goto start_log_replay; } - /* ok, there are transactions that need to be replayed. start with the first log block, find - ** all the valid transactions, and pick out the oldest. + /* + * ok, there are transactions that need to be replayed. start + * with the first log block, find all the valid transactions, and + * pick out the oldest. */ while (continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb))) { - /* Note that it is required for blocksize of primary fs device and journal - device to be the same */ + /* + * Note that it is required for blocksize of primary fs + * device and journal device to be the same + */ d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, sb->s_blocksize, @@ -2413,7 +2491,7 @@ static int journal_read(struct super_block *sb) brelse(d_bh); } - start_log_replay: +start_log_replay: cur_dblock = oldest_start; if (oldest_trans_id) { reiserfs_debug(sb, REISERFS_DEBUG_CODE, @@ -2444,9 +2522,11 @@ static int journal_read(struct super_block *sb) reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " "transactions found"); } - /* j_start does not get set correctly if we don't replay any transactions. - ** if we had a valid journal_header, set j_start to the first unflushed transaction value, - ** copy the trans_id from the header + /* + * j_start does not get set correctly if we don't replay any + * transactions. if we had a valid journal_header, set j_start + * to the first unflushed transaction value, copy the trans_id + * from the header */ if (valid_journal_header && replay_count == 0) { journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); @@ -2475,8 +2555,9 @@ static int journal_read(struct super_block *sb) _update_journal_header_block(sb, journal->j_start, journal->j_last_flush_trans_id)) { reiserfs_write_unlock(sb); - /* replay failed, caller must call free_journal_ram and abort - ** the mount + /* + * replay failed, caller must call free_journal_ram and abort + * the mount */ return -1; } @@ -2569,7 +2650,7 @@ static int journal_init_dev(struct super_block *super, return 0; } -/** +/* * When creating/tuning a file system user can assign some * journal params within boundaries which depend on the ratio * blocksize/standard_blocksize. @@ -2587,8 +2668,7 @@ static int check_advise_trans_params(struct super_block *sb, struct reiserfs_journal *journal) { if (journal->j_trans_max) { - /* Non-default journal params. - Do sanity check for them. */ + /* Non-default journal params. Do sanity check for them. */ int ratio = 1; if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE) ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize; @@ -2610,10 +2690,12 @@ static int check_advise_trans_params(struct super_block *sb, return 1; } } else { - /* Default journal params. - The file system was created by old version - of mkreiserfs, so some fields contain zeros, - and we need to advise proper values for them */ + /* + * Default journal params. + * The file system was created by old version + * of mkreiserfs, so some fields contain zeros, + * and we need to advise proper values for them + */ if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) { reiserfs_warning(sb, "sh-464", "bad blocksize (%u)", sb->s_blocksize); @@ -2626,9 +2708,7 @@ static int check_advise_trans_params(struct super_block *sb, return 0; } -/* -** must be called once on fs mount. calls journal_read for you -*/ +/* must be called once on fs mount. calls journal_read for you */ int journal_init(struct super_block *sb, const char *j_dev_name, int old_format, unsigned int commit_max_age) { @@ -2667,8 +2747,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, REISERFS_DISK_OFFSET_IN_BYTES / sb->s_blocksize + 2); - /* Sanity check to see is the standard journal fitting within first bitmap - (actual for small blocksizes) */ + /* + * Sanity check to see is the standard journal fitting + * within first bitmap (actual for small blocksizes) + */ if (!SB_ONDISK_JOURNAL_DEVICE(sb) && (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) { @@ -2754,20 +2836,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_start = 0; journal->j_len = 0; journal->j_len_alloc = 0; - atomic_set(&(journal->j_wcount), 0); - atomic_set(&(journal->j_async_throttle), 0); + atomic_set(&journal->j_wcount, 0); + atomic_set(&journal->j_async_throttle, 0); journal->j_bcount = 0; journal->j_trans_start_time = 0; journal->j_last = NULL; journal->j_first = NULL; - init_waitqueue_head(&(journal->j_join_wait)); + init_waitqueue_head(&journal->j_join_wait); mutex_init(&journal->j_mutex); mutex_init(&journal->j_flush_mutex); journal->j_trans_id = 10; journal->j_mount_id = 10; journal->j_state = 0; - atomic_set(&(journal->j_jlock), 0); + atomic_set(&journal->j_jlock, 0); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; @@ -2807,23 +2889,19 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) - commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); - INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; return 0; - free_and_return: +free_and_return: free_journal_ram(sb); return 1; } /* -** test for a polite end of the current transaction. Used by file_write, and should -** be used by delete to make sure they don't write more than can fit inside a single -** transaction -*/ + * test for a polite end of the current transaction. Used by file_write, + * and should be used by delete to make sure they don't write more than + * can fit inside a single transaction + */ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { @@ -2835,7 +2913,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, return 0; if (journal->j_must_wait > 0 || (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || - atomic_read(&(journal->j_jlock)) || + atomic_read(&journal->j_jlock) || (now - journal->j_trans_start_time) > journal->j_max_trans_age || journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; @@ -2846,8 +2924,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, return 0; } -/* this must be called inside a transaction -*/ +/* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); @@ -2857,8 +2934,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) return; } -/* this must be called without a transaction started -*/ +/* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { struct reiserfs_journal *journal = SB_JOURNAL(s); @@ -2866,8 +2942,7 @@ void reiserfs_allow_writes(struct super_block *s) wake_up(&journal->j_join_wait); } -/* this must be called without a transaction started -*/ +/* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { struct reiserfs_journal *journal = SB_JOURNAL(s); @@ -2929,11 +3004,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) } } -/* join == true if you must join an existing transaction. -** join == false if you can deal with waiting for others to finish -** -** this will block until the transaction is joinable. send the number of blocks you -** expect to use in nblocks. +/* + * join == true if you must join an existing transaction. + * join == false if you can deal with waiting for others to finish + * + * this will block until the transaction is joinable. send the number of + * blocks you expect to use in nblocks. */ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, @@ -2955,7 +3031,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, th->t_refcount = 1; th->t_super = sb; - relock: +relock: lock_journal(sb); if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { unlock_journal(sb); @@ -2974,9 +3050,11 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, } now = get_seconds(); - /* if there is no room in the journal OR - ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning - ** we don't sleep if there aren't other writers + /* + * if there is no room in the journal OR + * if this transaction is too old, and we weren't called joinable, + * wait for it to finish before beginning we don't sleep if there + * aren't other writers */ if ((!join && journal->j_must_wait > 0) || @@ -2990,7 +3068,8 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { old_trans_id = journal->j_trans_id; - unlock_journal(sb); /* allow others to finish this transaction */ + /* allow others to finish this transaction */ + unlock_journal(sb); if (!join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch && @@ -3002,8 +3081,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, goto relock; } } - /* don't mess with joining the transaction if all we have to do is - * wait for someone else to do a commit + /* + * don't mess with joining the transaction if all we + * have to do is wait for someone else to do a commit */ if (atomic_read(&journal->j_jlock)) { while (journal->j_trans_id == old_trans_id && @@ -3012,15 +3092,15 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, } goto relock; } - retval = journal_join(&myth, sb, 1); + retval = journal_join(&myth, sb); if (retval) goto out_fail; /* someone might have ended the transaction while we joined */ if (old_trans_id != journal->j_trans_id) { - retval = do_journal_end(&myth, sb, 1, 0); + retval = do_journal_end(&myth, 0); } else { - retval = do_journal_end(&myth, sb, 1, COMMIT_NOW); + retval = do_journal_end(&myth, COMMIT_NOW); } if (retval) @@ -3033,7 +3113,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, if (journal->j_trans_start_time == 0) { journal->j_trans_start_time = get_seconds(); } - atomic_inc(&(journal->j_wcount)); + atomic_inc(&journal->j_wcount); journal->j_len_alloc += nblocks; th->t_blocks_logged = 0; th->t_blocks_allocated = nblocks; @@ -3042,11 +3122,13 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, INIT_LIST_HEAD(&th->t_list); return 0; - out_fail: +out_fail: memset(th, 0, sizeof(*th)); - /* Re-set th->t_super, so we can properly keep track of how many + /* + * Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later */ + * call is part of a failed restart_transaction, we can free it later + */ th->t_super = sb; return retval; } @@ -3059,14 +3141,15 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct int ret; struct reiserfs_transaction_handle *th; - /* if we're nesting into an existing transaction. It will be - ** persistent on its own + /* + * if we're nesting into an existing transaction. It will be + * persistent on its own */ if (reiserfs_transaction_running(s)) { th = current->journal_info; th->t_refcount++; BUG_ON(th->t_refcount < 2); - + return th; } th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); @@ -3087,7 +3170,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) struct super_block *s = th->t_super; int ret = 0; if (th->t_trans_id) - ret = journal_end(th, th->t_super, th->t_blocks_allocated); + ret = journal_end(th); else ret = -EIO; if (th->t_refcount == 0) { @@ -3098,29 +3181,31 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) } static int journal_join(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) + struct super_block *sb) { struct reiserfs_transaction_handle *cur_th = current->journal_info; - /* this keeps do_journal_end from NULLing out the current->journal_info - ** pointer + /* + * this keeps do_journal_end from NULLing out the + * current->journal_info pointer */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN); + return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN); } int journal_join_abort(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) + struct super_block *sb) { struct reiserfs_transaction_handle *cur_th = current->journal_info; - /* this keeps do_journal_end from NULLing out the current->journal_info - ** pointer + /* + * this keeps do_journal_end from NULLing out the + * current->journal_info pointer */ th->t_handle_save = cur_th; BUG_ON(cur_th && cur_th->t_refcount > 1); - return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT); + return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT); } int journal_begin(struct reiserfs_transaction_handle *th, @@ -3142,9 +3227,10 @@ int journal_begin(struct reiserfs_transaction_handle *th, "journal_info != 0"); return 0; } else { - /* we've ended up with a handle from a different filesystem. - ** save it and restore on journal_end. This should never - ** really happen... + /* + * we've ended up with a handle from a different + * filesystem. save it and restore on journal_end. + * This should never really happen... */ reiserfs_warning(sb, "clm-2100", "nesting info a different FS"); @@ -3157,9 +3243,10 @@ int journal_begin(struct reiserfs_transaction_handle *th, ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG); BUG_ON(current->journal_info != th); - /* I guess this boils down to being the reciprocal of clm-2100 above. - * If do_journal_begin_r fails, we need to put it back, since journal_end - * won't be called to do it. */ + /* + * I guess this boils down to being the reciprocal of clm-2100 above. + * If do_journal_begin_r fails, we need to put it back, since + * journal_end won't be called to do it. */ if (ret) current->journal_info = th->t_handle_save; else @@ -3169,17 +3256,19 @@ int journal_begin(struct reiserfs_transaction_handle *th, } /* -** puts bh into the current transaction. If it was already there, reorders removes the -** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). -** -** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the -** transaction is committed. -** -** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. -*/ + * puts bh into the current transaction. If it was already there, reorders + * removes the old pointers from the hash, and puts new ones in (to make + * sure replay happen in the right order). + * + * if it was dirty, cleans and files onto the clean list. I can't let it + * be dirty again until the transaction is committed. + * + * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. + */ int journal_mark_dirty(struct reiserfs_transaction_handle *th, - struct super_block *sb, struct buffer_head *bh) + struct buffer_head *bh) { + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn = NULL; int count_already_incd = 0; @@ -3201,9 +3290,10 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, return 0; } - /* this must be turned into a panic instead of a warning. We can't allow - ** a dirty or journal_dirty or locked buffer to be logged, as some changes - ** could get to disk too early. NOT GOOD. + /* + * this must be turned into a panic instead of a warning. We can't + * allow a dirty or journal_dirty or locked buffer to be logged, as + * some changes could get to disk too early. NOT GOOD. */ if (!prepared || buffer_dirty(bh)) { reiserfs_warning(sb, "journal-1777", @@ -3216,14 +3306,16 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, buffer_journal_dirty(bh) ? ' ' : '!'); } - if (atomic_read(&(journal->j_wcount)) <= 0) { + if (atomic_read(&journal->j_wcount) <= 0) { reiserfs_warning(sb, "journal-1409", "returning because j_wcount was %d", - atomic_read(&(journal->j_wcount))); + atomic_read(&journal->j_wcount)); return 1; } - /* this error means I've screwed up, and we've overflowed the transaction. - ** Nothing can be done here, except make the FS readonly or panic. + /* + * this error means I've screwed up, and we've overflowed + * the transaction. Nothing can be done here, except make the + * FS readonly or panic. */ if (journal->j_len >= journal->j_trans_max) { reiserfs_panic(th->t_super, "journal-1413", @@ -3280,9 +3372,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, return 0; } -int journal_end(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) +int journal_end(struct reiserfs_transaction_handle *th) { + struct super_block *sb = th->t_super; if (!current->journal_info && th->t_refcount > 1) reiserfs_warning(sb, "REISER-NESTING", "th NULL, refcount %d", th->t_refcount); @@ -3297,8 +3389,9 @@ int journal_end(struct reiserfs_transaction_handle *th, struct reiserfs_transaction_handle *cur_th = current->journal_info; - /* we aren't allowed to close a nested transaction on a different - ** filesystem from the one in the task struct + /* + * we aren't allowed to close a nested transaction on a + * different filesystem from the one in the task struct */ BUG_ON(cur_th->t_super != th->t_super); @@ -3308,17 +3401,18 @@ int journal_end(struct reiserfs_transaction_handle *th, } return 0; } else { - return do_journal_end(th, sb, nblocks, 0); + return do_journal_end(th, 0); } } -/* removes from the current transaction, relsing and descrementing any counters. -** also files the removed buffer directly onto the clean list -** -** called by journal_mark_freed when a block has been deleted -** -** returns 1 if it cleaned and relsed the buffer. 0 otherwise -*/ +/* + * removes from the current transaction, relsing and descrementing any counters. + * also files the removed buffer directly onto the clean list + * + * called by journal_mark_freed when a block has been deleted + * + * returns 1 if it cleaned and relsed the buffer. 0 otherwise + */ static int remove_from_transaction(struct super_block *sb, b_blocknr_t blocknr, int already_cleaned) { @@ -3354,7 +3448,7 @@ static int remove_from_transaction(struct super_block *sb, clear_buffer_dirty(bh); clear_buffer_journal_test(bh); put_bh(bh); - if (atomic_read(&(bh->b_count)) < 0) { + if (atomic_read(&bh->b_count) < 0) { reiserfs_warning(sb, "journal-1752", "b_count < 0"); } @@ -3367,15 +3461,16 @@ static int remove_from_transaction(struct super_block *sb, } /* -** for any cnode in a journal list, it can only be dirtied of all the -** transactions that include it are committed to disk. -** this checks through each transaction, and returns 1 if you are allowed to dirty, -** and 0 if you aren't -** -** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log -** blocks for a given transaction on disk -** -*/ + * for any cnode in a journal list, it can only be dirtied of all the + * transactions that include it are committed to disk. + * this checks through each transaction, and returns 1 if you are allowed + * to dirty, and 0 if you aren't + * + * it is called by dirty_journal_list, which is called after + * flush_commit_list has gotten all the log blocks for a given + * transaction on disk + * + */ static int can_dirty(struct reiserfs_journal_cnode *cn) { struct super_block *sb = cn->sb; @@ -3383,9 +3478,10 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) struct reiserfs_journal_cnode *cur = cn->hprev; int can_dirty = 1; - /* first test hprev. These are all newer than cn, so any node here - ** with the same block number and dev means this node can't be sent - ** to disk right now. + /* + * first test hprev. These are all newer than cn, so any node here + * with the same block number and dev means this node can't be sent + * to disk right now. */ while (cur && can_dirty) { if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && @@ -3394,13 +3490,14 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) } cur = cur->hprev; } - /* then test hnext. These are all older than cn. As long as they - ** are committed to the log, it is safe to write cn to disk + /* + * then test hnext. These are all older than cn. As long as they + * are committed to the log, it is safe to write cn to disk */ cur = cn->hnext; while (cur && can_dirty) { if (cur->jlist && cur->jlist->j_len > 0 && - atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && + atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh && cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { can_dirty = 0; } @@ -3409,12 +3506,13 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) return can_dirty; } -/* syncs the commit blocks, but does not force the real buffers to disk -** will wait until the current transaction is done/committed before returning -*/ -int journal_end_sync(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks) +/* + * syncs the commit blocks, but does not force the real buffers to disk + * will wait until the current transaction is done/committed before returning + */ +int journal_end_sync(struct reiserfs_transaction_handle *th) { + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); @@ -3423,14 +3521,12 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, if (journal->j_len == 0) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); } - return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT); + return do_journal_end(th, COMMIT_NOW | WAIT); } -/* -** writeback the pending async commits to disk -*/ +/* writeback the pending async commits to disk */ static void flush_async_commits(struct work_struct *work) { struct reiserfs_journal *journal = @@ -3450,9 +3546,9 @@ static void flush_async_commits(struct work_struct *work) } /* -** flushes any old transactions to disk -** ends the current transaction if it is too old -*/ + * flushes any old transactions to disk + * ends the current transaction if it is too old + */ void reiserfs_flush_old_commits(struct super_block *sb) { time_t now; @@ -3460,48 +3556,53 @@ void reiserfs_flush_old_commits(struct super_block *sb) struct reiserfs_journal *journal = SB_JOURNAL(sb); now = get_seconds(); - /* safety check so we don't flush while we are replaying the log during + /* + * safety check so we don't flush while we are replaying the log during * mount */ if (list_empty(&journal->j_journal_list)) return; - /* check the current transaction. If there are no writers, and it is + /* + * check the current transaction. If there are no writers, and it is * too old, finish it, and force the commit blocks to disk */ if (atomic_read(&journal->j_wcount) <= 0 && journal->j_trans_start_time > 0 && journal->j_len > 0 && (now - journal->j_trans_start_time) > journal->j_max_trans_age) { - if (!journal_join(&th, sb, 1)) { + if (!journal_join(&th, sb)) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&th, sb, - SB_BUFFER_WITH_SB(sb)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); - /* we're only being called from kreiserfsd, it makes no sense to do - ** an async commit so that kreiserfsd can do it later + /* + * we're only being called from kreiserfsd, it makes + * no sense to do an async commit so that kreiserfsd + * can do it later */ - do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT); + do_journal_end(&th, COMMIT_NOW | WAIT); } } } /* -** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit -** -** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all -** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just -** flushes the commit list and returns 0. -** -** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. -** -** Note, we can't allow the journal_end to proceed while there are still writers in the log. -*/ -static int check_journal_end(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks, - int flags) + * returns 0 if do_journal_end should return right away, returns 1 if + * do_journal_end should finish the commit + * + * if the current transaction is too old, but still has writers, this will + * wait on j_join_wait until all the writers are done. By the time it + * wakes up, the transaction it was called has already ended, so it just + * flushes the commit list and returns 0. + * + * Won't batch when flush or commit_now is set. Also won't batch when + * others are waiting on j_join_wait. + * + * Note, we can't allow the journal_end to proceed while there are still + * writers in the log. + */ +static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) { time_t now; @@ -3509,6 +3610,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, int commit_now = flags & COMMIT_NOW; int wait_on_commit = flags & WAIT; struct reiserfs_journal_list *jl; + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); BUG_ON(!th->t_trans_id); @@ -3520,23 +3622,27 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, } journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); - if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */ - atomic_dec(&(journal->j_wcount)); - } + /* <= 0 is allowed. unmounting might not call begin */ + if (atomic_read(&journal->j_wcount) > 0) + atomic_dec(&journal->j_wcount); - /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released - ** will be dealt with by next transaction that actually writes something, but should be taken - ** care of in this trans + /* + * BUG, deal with case where j_len is 0, but people previously + * freed blocks need to be released will be dealt with by next + * transaction that actually writes something, but should be taken + * care of in this trans */ BUG_ON(journal->j_len == 0); - /* if wcount > 0, and we are called to with flush or commit_now, - ** we wait on j_join_wait. We will wake up when the last writer has - ** finished the transaction, and started it on its way to the disk. - ** Then, we flush the commit or journal list, and just return 0 - ** because the rest of journal end was already done for this transaction. + /* + * if wcount > 0, and we are called to with flush or commit_now, + * we wait on j_join_wait. We will wake up when the last writer has + * finished the transaction, and started it on its way to the disk. + * Then, we flush the commit or journal list, and just return 0 + * because the rest of journal end was already done for this + * transaction. */ - if (atomic_read(&(journal->j_wcount)) > 0) { + if (atomic_read(&journal->j_wcount) > 0) { if (flush || commit_now) { unsigned trans_id; @@ -3544,27 +3650,30 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, trans_id = jl->j_trans_id; if (wait_on_commit) jl->j_state |= LIST_COMMIT_PENDING; - atomic_set(&(journal->j_jlock), 1); + atomic_set(&journal->j_jlock, 1); if (flush) { journal->j_next_full_flush = 1; } unlock_journal(sb); - /* sleep while the current transaction is still j_jlocked */ + /* + * sleep while the current transaction is + * still j_jlocked + */ while (journal->j_trans_id == trans_id) { if (atomic_read(&journal->j_jlock)) { queue_log_writer(sb); } else { lock_journal(sb); if (journal->j_trans_id == trans_id) { - atomic_set(&(journal->j_jlock), + atomic_set(&journal->j_jlock, 1); } unlock_journal(sb); } } BUG_ON(journal->j_trans_id == trans_id); - + if (commit_now && journal_list_still_alive(sb, trans_id) && wait_on_commit) { @@ -3584,7 +3693,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, } /* don't batch when someone is waiting on j_join_wait */ /* don't batch when syncing the commit or flushing the whole trans */ - if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) + if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock)) && !flush && !commit_now && (journal->j_len < journal->j_max_batch) && journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) { @@ -3602,19 +3711,22 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, } /* -** Does all the work that makes deleting blocks safe. -** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. -** -** otherwise: -** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes -** before this transaction has finished. -** -** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with -** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash, -** the block can't be reallocated yet. -** -** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. -*/ + * Does all the work that makes deleting blocks safe. + * when deleting a block mark BH_JNew, just remove it from the current + * transaction, clean it's buffer_head and move on. + * + * otherwise: + * set a bit for the block in the journal bitmap. That will prevent it from + * being allocated for unformatted nodes before this transaction has finished. + * + * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. + * That will prevent any old transactions with this block from trying to flush + * to the real location. Since we aren't removing the cnode from the + * journal_list_hash, *the block can't be reallocated yet. + * + * Then remove it from the current transaction, decrementing any counters and + * filing it on the clean list. + */ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *sb, b_blocknr_t blocknr) { @@ -3637,7 +3749,10 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, reiserfs_clean_and_file_buffer(bh); cleaned = remove_from_transaction(sb, blocknr, cleaned); } else { - /* set the bit for this block in the journal bitmap for this transaction */ + /* + * set the bit for this block in the journal bitmap + * for this transaction + */ jb = journal->j_current_jl->j_list_bitmap; if (!jb) { reiserfs_panic(sb, "journal-1702", @@ -3653,17 +3768,22 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, } cleaned = remove_from_transaction(sb, blocknr, cleaned); - /* find all older transactions with this block, make sure they don't try to write it out */ + /* + * find all older transactions with this block, + * make sure they don't try to write it out + */ cn = get_journal_hash_dev(sb, journal->j_list_hash_table, blocknr); while (cn) { if (sb == cn->sb && blocknr == cn->blocknr) { set_bit(BLOCK_FREED, &cn->state); if (cn->bh) { + /* + * remove_from_transaction will brelse + * the buffer if it was in the current + * trans + */ if (!cleaned) { - /* remove_from_transaction will brelse the buffer if it was - ** in the current trans - */ clear_buffer_journal_dirty(cn-> bh); clear_buffer_dirty(cn->bh); @@ -3672,16 +3792,19 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, cleaned = 1; put_bh(cn->bh); if (atomic_read - (&(cn->bh->b_count)) < 0) { + (&cn->bh->b_count) < 0) { reiserfs_warning(sb, "journal-2138", "cn->bh->b_count < 0"); } } - if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ - atomic_dec(& - (cn->jlist-> - j_nonzerolen)); + /* + * since we are clearing the bh, + * we MUST dec nonzerolen + */ + if (cn->jlist) { + atomic_dec(&cn->jlist-> + j_nonzerolen); } cn->bh = NULL; } @@ -3714,10 +3837,16 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id, struct reiserfs_journal *journal = SB_JOURNAL(sb); int ret = 0; - /* is it from the current transaction, or from an unknown transaction? */ + /* + * is it from the current transaction, + * or from an unknown transaction? + */ if (id == journal->j_trans_id) { jl = journal->j_current_jl; - /* try to let other writers come in and grow this transaction */ + /* + * try to let other writers come in and + * grow this transaction + */ let_transaction_grow(sb, id); if (journal->j_trans_id != id) { goto flush_commit_only; @@ -3731,21 +3860,22 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id, if (journal->j_trans_id != id) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)); - ret = journal_end(&th, sb, 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb)); + ret = journal_end(&th); goto flush_commit_only; } - ret = journal_end_sync(&th, sb, 1); + ret = journal_end_sync(&th); if (!ret) ret = 1; } else { - /* this gets tricky, we have to make sure the journal list in + /* + * this gets tricky, we have to make sure the journal list in * the inode still exists. We know the list is still around * if we've got a larger transaction id than the oldest list */ - flush_commit_only: +flush_commit_only: if (journal_list_still_alive(inode->i_sb, id)) { /* * we only set ret to 1 when we know for sure @@ -3768,7 +3898,8 @@ int reiserfs_commit_for_inode(struct inode *inode) unsigned int id = REISERFS_I(inode)->i_trans_id; struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; - /* for the whole inode, assume unset id means it was + /* + * for the whole inode, assume unset id means it was * changed in the current transaction. More conservative */ if (!id || !jl) { @@ -3806,12 +3937,11 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb, extern struct tree_balance *cur_tb; /* -** before we can change a metadata block, we have to make sure it won't -** be written to disk while we are altering it. So, we must: -** clean it -** wait on it. -** -*/ + * before we can change a metadata block, we have to make sure it won't + * be written to disk while we are altering it. So, we must: + * clean it + * wait on it. + */ int reiserfs_prepare_for_journal(struct super_block *sb, struct buffer_head *bh, int wait) { @@ -3832,19 +3962,18 @@ int reiserfs_prepare_for_journal(struct super_block *sb, } /* -** long and ugly. If flush, will not return until all commit -** blocks and all real buffers in the trans are on disk. -** If no_async, won't return until all commit blocks are on disk. -** -** keep reading, there are comments as you go along -** -** If the journal is aborted, we just clean up. Things like flushing -** journal lists, etc just won't happen. -*/ -static int do_journal_end(struct reiserfs_transaction_handle *th, - struct super_block *sb, unsigned long nblocks, - int flags) + * long and ugly. If flush, will not return until all commit + * blocks and all real buffers in the trans are on disk. + * If no_async, won't return until all commit blocks are on disk. + * + * keep reading, there are comments as you go along + * + * If the journal is aborted, we just clean up. Things like flushing + * journal lists, etc just won't happen. + */ +static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) { + struct super_block *sb = th->t_super; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_journal_cnode *cn, *next, *jl_cn; struct reiserfs_journal_cnode *last_cn = NULL; @@ -3866,9 +3995,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, BUG_ON(th->t_refcount > 1); BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_super); - /* protect flush_older_commits from doing mistakes if the - transaction ID counter gets overflowed. */ + /* + * protect flush_older_commits from doing mistakes if the + * transaction ID counter gets overflowed. + */ if (th->t_trans_id == ~0U) flags |= FLUSH_ALL | COMMIT_NOW | WAIT; flush = flags & FLUSH_ALL; @@ -3879,7 +4011,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, if (journal->j_len == 0) { reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1); - journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb)); } lock_journal(sb); @@ -3892,10 +4024,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, wait_on_commit = 1; } - /* check_journal_end locks the journal, and unlocks if it does not return 1 - ** it tells us if we should continue with the journal_end, or just return + /* + * check_journal_end locks the journal, and unlocks if it does + * not return 1 it tells us if we should continue with the + * journal_end, or just return */ - if (!check_journal_end(th, sb, nblocks, flags)) { + if (!check_journal_end(th, flags)) { reiserfs_schedule_old_flush(sb); wake_queued_writers(sb); reiserfs_async_progress_wait(sb); @@ -3908,19 +4042,23 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, } /* - ** j must wait means we have to flush the log blocks, and the real blocks for - ** this transaction + * j must wait means we have to flush the log blocks, and the + * real blocks for this transaction */ if (journal->j_must_wait > 0) { flush = 1; } #ifdef REISERFS_PREALLOCATE - /* quota ops might need to nest, setup the journal_info pointer for them - * and raise the refcount so that it is > 0. */ + /* + * quota ops might need to nest, setup the journal_info pointer + * for them and raise the refcount so that it is > 0. + */ current->journal_info = th; th->t_refcount++; - reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into - * the transaction */ + + /* it should not involve new blocks into the transaction */ + reiserfs_discard_all_prealloc(th); + th->t_refcount--; current->journal_info = th->t_handle_save; #endif @@ -3936,7 +4074,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); set_desc_trans_id(desc, journal->j_trans_id); - /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ + /* + * setup commit block. Don't write (keep it clean too) this one + * until after everyone else is written + */ c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(sb))); @@ -3948,7 +4089,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, /* init this journal list */ jl = journal->j_current_jl; - /* we lock the commit before doing anything because + /* + * we lock the commit before doing anything because * we want to make sure nobody tries to run flush_commit_list until * the new transaction is fully setup, and we've already flushed the * ordered bh list @@ -3968,9 +4110,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, atomic_set(&jl->j_commit_left, journal->j_len + 2); jl->j_realblock = NULL; - /* The ENTIRE FOR LOOP MUST not cause schedule to occur. - ** for each real block, add it to the journal list hash, - ** copy into real block index array in the commit or desc block + /* + * The ENTIRE FOR LOOP MUST not cause schedule to occur. + * for each real block, add it to the journal list hash, + * copy into real block index array in the commit or desc block */ trans_half = journal_trans_half(sb->s_blocksize); for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { @@ -3989,9 +4132,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, last_cn->next = jl_cn; } last_cn = jl_cn; - /* make sure the block we are trying to log is not a block - of journal or reserved area */ - + /* + * make sure the block we are trying to log + * is not a block of journal or reserved area + */ if (is_block_in_log_or_reserved_area (sb, cn->bh->b_blocknr)) { reiserfs_panic(sb, "journal-2332", @@ -4021,19 +4165,26 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, set_desc_trans_id(desc, journal->j_trans_id); set_commit_trans_len(commit, journal->j_len); - /* special check in case all buffers in the journal were marked for not logging */ + /* + * special check in case all buffers in the journal + * were marked for not logging + */ BUG_ON(journal->j_len == 0); - /* we're about to dirty all the log blocks, mark the description block + /* + * we're about to dirty all the log blocks, mark the description block * dirty now too. Don't mark the commit block dirty until all the * others are on disk */ mark_buffer_dirty(d_bh); - /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ + /* + * first data block is j_start + 1, so add one to + * cur_write_start wherever you use it + */ cur_write_start = journal->j_start; cn = journal->j_first; - jindex = 1; /* start at one so we don't get the desc again */ + jindex = 1; /* start at one so we don't get the desc again */ while (cn) { clear_buffer_journal_new(cn->bh); /* copy all the real blocks into log area. dirty log blocks */ @@ -4059,7 +4210,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, set_buffer_journal_dirty(cn->bh); clear_buffer_journaled(cn->bh); } else { - /* JDirty cleared sometime during transaction. don't log this one */ + /* + * JDirty cleared sometime during transaction. + * don't log this one + */ reiserfs_warning(sb, "journal-2048", "BAD, buffer in journal hash, " "but not JDirty!"); @@ -4071,9 +4225,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, reiserfs_cond_resched(sb); } - /* we are done with both the c_bh and d_bh, but - ** c_bh must be written after all other commit blocks, - ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. + /* + * we are done with both the c_bh and d_bh, but + * c_bh must be written after all other commit blocks, + * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. */ journal->j_current_jl = alloc_journal_list(sb); @@ -4088,7 +4243,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); - atomic_set(&(journal->j_wcount), 0); + atomic_set(&journal->j_wcount, 0); journal->j_bcount = 0; journal->j_last = NULL; journal->j_first = NULL; @@ -4104,15 +4259,18 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, journal->j_next_async_flush = 0; init_journal_hash(sb); - // make sure reiserfs_add_jh sees the new current_jl before we - // write out the tails + /* + * make sure reiserfs_add_jh sees the new current_jl before we + * write out the tails + */ smp_mb(); - /* tail conversion targets have to hit the disk before we end the + /* + * tail conversion targets have to hit the disk before we end the * transaction. Otherwise a later transaction might repack the tail - * before this transaction commits, leaving the data block unflushed and - * clean, if we crash before the later transaction commits, the data block - * is lost. + * before this transaction commits, leaving the data block unflushed + * and clean, if we crash before the later transaction commits, the + * data block is lost. */ if (!list_empty(&jl->j_tail_bh_list)) { depth = reiserfs_write_unlock_nested(sb); @@ -4123,24 +4281,27 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, BUG_ON(!list_empty(&jl->j_tail_bh_list)); mutex_unlock(&jl->j_commit_mutex); - /* honor the flush wishes from the caller, simple commits can - ** be done outside the journal lock, they are done below - ** - ** if we don't flush the commit list right now, we put it into - ** the work queue so the people waiting on the async progress work - ** queue don't wait for this proc to flush journal lists and such. + /* + * honor the flush wishes from the caller, simple commits can + * be done outside the journal lock, they are done below + * + * if we don't flush the commit list right now, we put it into + * the work queue so the people waiting on the async progress work + * queue don't wait for this proc to flush journal lists and such. */ if (flush) { flush_commit_list(sb, jl, 1); flush_journal_list(sb, jl, 1); } else if (!(jl->j_state & LIST_COMMIT_PENDING)) - queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); + queue_delayed_work(REISERFS_SB(sb)->commit_wq, + &journal->j_work, HZ / 10); - /* if the next transaction has any chance of wrapping, flush - ** transactions that might get overwritten. If any journal lists are very - ** old flush them as well. + /* + * if the next transaction has any chance of wrapping, flush + * transactions that might get overwritten. If any journal lists + * are very old flush them as well. */ - first_jl: +first_jl: list_for_each_safe(entry, safe, &journal->j_journal_list) { temp_jl = JOURNAL_LIST_ENTRY(entry); if (journal->j_start <= temp_jl->j_start) { @@ -4151,8 +4312,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, } else if ((journal->j_start + journal->j_trans_max + 1) < SB_ONDISK_JOURNAL_SIZE(sb)) { - /* if we don't cross into the next transaction and we don't - * wrap, there is no way we can overlap any later transactions + /* + * if we don't cross into the next + * transaction and we don't wrap, there is + * no way we can overlap any later transactions * break now */ break; @@ -4166,10 +4329,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush_used_journal_lists(sb, temp_jl); goto first_jl; } else { - /* we don't overlap anything from out start to the end of the - * log, and our wrapped portion doesn't overlap anything at - * the start of the log. We can break - */ + /* + * we don't overlap anything from out start + * to the end of the log, and our wrapped + * portion doesn't overlap anything at + * the start of the log. We can break + */ break; } } @@ -4183,23 +4348,25 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, "could not get a list bitmap"); } - atomic_set(&(journal->j_jlock), 0); + atomic_set(&journal->j_jlock, 0); unlock_journal(sb); /* wake up any body waiting to join. */ clear_bit(J_WRITERS_QUEUED, &journal->j_state); - wake_up(&(journal->j_join_wait)); + wake_up(&journal->j_join_wait); if (!flush && wait_on_commit && journal_list_still_alive(sb, commit_trans_id)) { flush_commit_list(sb, jl, 1); } - out: +out: reiserfs_check_lock_depth(sb, "journal end2"); memset(th, 0, sizeof(*th)); - /* Re-set th->t_super, so we can properly keep track of how many + /* + * Re-set th->t_super, so we can properly keep track of how many * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later */ + * call is part of a failed restart_transaction, we can free it later + */ th->t_super = sb; return journal->j_errno; diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 79e5a8b4c226..d6744c8b24e1 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -8,46 +8,42 @@ #include "reiserfs.h" #include <linux/buffer_head.h> -/* these are used in do_balance.c */ - -/* leaf_move_items - leaf_shift_left - leaf_shift_right - leaf_delete_items - leaf_insert_into_buf - leaf_paste_in_buffer - leaf_cut_from_buffer - leaf_paste_entries - */ - -/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ +/* + * copy copy_count entries from source directory item to dest buffer + * (creating new item if needed) + */ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, struct buffer_head *source, int last_first, int item_num, int from, int copy_count) { struct buffer_head *dest = dest_bi->bi_bh; - int item_num_in_dest; /* either the number of target item, - or if we must create a new item, - the number of the item we will - create it next to */ + /* + * either the number of target item, or if we must create a + * new item, the number of the item we will create it next to + */ + int item_num_in_dest; + struct item_head *ih; struct reiserfs_de_head *deh; int copy_records_len; /* length of all records in item to be copied */ char *records; - ih = B_N_PITEM_HEAD(source, item_num); + ih = item_head(source, item_num); RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); - /* length of all record to be copied and first byte of the last of them */ + /* + * length of all record to be copied and first byte of + * the last of them + */ deh = B_I_DEH(source, ih); if (copy_count) { - copy_records_len = (from ? deh_location(&(deh[from - 1])) : + copy_records_len = (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)) - - deh_location(&(deh[from + copy_count - 1])); + deh_location(&deh[from + copy_count - 1]); records = source->b_data + ih_location(ih) + - deh_location(&(deh[from + copy_count - 1])); + deh_location(&deh[from + copy_count - 1]); } else { copy_records_len = 0; records = NULL; @@ -59,12 +55,15 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest) - 1); - /* if there are no items in dest or the first/last item in dest is not item of the same directory */ + /* + * if there are no items in dest or the first/last item in + * dest is not item of the same directory + */ if ((item_num_in_dest == -1) || (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || (last_first == LAST_TO_FIRST && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, - B_N_PKEY(dest, + leaf_key(dest, item_num_in_dest)))) { /* create new item in dest */ @@ -80,16 +79,22 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, if (last_first == LAST_TO_FIRST) { /* form key by the following way */ - if (from < I_ENTRY_COUNT(ih)) { + if (from < ih_entry_count(ih)) { set_le_ih_k_offset(&new_ih, - deh_offset(&(deh[from]))); - /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */ + deh_offset(&deh[from])); } else { - /* no entries will be copied to this item in this function */ + /* + * no entries will be copied to this + * item in this function + */ set_le_ih_k_offset(&new_ih, U32_MAX); - /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ + /* + * this item is not yet valid, but we + * want I_IS_DIRECTORY_ITEM to return 1 + * for it, so we -1 + */ } - set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key), + set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key, TYPE_DIRENTRY); } @@ -113,36 +118,44 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, leaf_paste_entries(dest_bi, item_num_in_dest, (last_first == - FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest, + FIRST_TO_LAST) ? ih_entry_count(item_head(dest, item_num_in_dest)) : 0, copy_count, deh + from, records, DEH_SIZE * copy_count + copy_records_len); } -/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or - part of it or nothing (see the return 0 below) from SOURCE to the end - (if last_first) or beginning (!last_first) of the DEST */ +/* + * Copy the first (if last_first == FIRST_TO_LAST) or last + * (last_first == LAST_TO_FIRST) item or part of it or nothing + * (see the return 0 below) from SOURCE to the end (if last_first) + * or beginning (!last_first) of the DEST + */ /* returns 1 if anything was copied, else 0 */ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int bytes_or_entries) { struct buffer_head *dest = dest_bi->bi_bh; - int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ + /* number of items in the source and destination buffers */ + int dest_nr_item, src_nr_item; struct item_head *ih; struct item_head *dih; dest_nr_item = B_NR_ITEMS(dest); + /* + * if ( DEST is empty or first item of SOURCE and last item of + * DEST are the items of different objects or of different types ) + * then there is no need to treat this item differently from the + * other items that we copy, so we return + */ if (last_first == FIRST_TO_LAST) { - /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects - or of different types ) then there is no need to treat this item differently from the other items - that we copy, so we return */ - ih = B_N_PITEM_HEAD(src, 0); - dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1); + ih = item_head(src, 0); + dih = item_head(dest, dest_nr_item - 1); + + /* there is nothing to merge */ if (!dest_nr_item - || (!op_is_left_mergeable(&(ih->ih_key), src->b_size))) - /* there is nothing to merge */ + || (!op_is_left_mergeable(&ih->ih_key, src->b_size))) return 0; RFALSE(!ih_item_len(ih), @@ -157,8 +170,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, return 1; } - /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST - part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header + /* + * copy part of the body of the first item of SOURCE + * to the end of the body of the last item of the DEST + * part defined by 'bytes_or_entries'; if bytes_or_entries + * == -1 copy whole body; don't create new item header */ if (bytes_or_entries == -1) bytes_or_entries = ih_item_len(ih); @@ -176,11 +192,13 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, } #endif - /* merge first item (or its part) of src buffer with the last - item of dest buffer. Both are of the same file */ + /* + * merge first item (or its part) of src buffer with the last + * item of dest buffer. Both are of the same file + */ leaf_paste_in_buffer(dest_bi, dest_nr_item - 1, ih_item_len(dih), - bytes_or_entries, B_I_PITEM(src, ih), 0); + bytes_or_entries, ih_item_body(src, ih), 0); if (is_indirect_le_ih(dih)) { RFALSE(get_ih_free_space(dih), @@ -195,19 +213,23 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, /* copy boundary item to right (last_first == LAST_TO_FIRST) */ - /* ( DEST is empty or last item of SOURCE and first item of DEST - are the items of different object or of different types ) + /* + * (DEST is empty or last item of SOURCE and first item of DEST + * are the items of different object or of different types) */ src_nr_item = B_NR_ITEMS(src); - ih = B_N_PITEM_HEAD(src, src_nr_item - 1); - dih = B_N_PITEM_HEAD(dest, 0); + ih = item_head(src, src_nr_item - 1); + dih = item_head(dest, 0); - if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size)) + if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size)) return 0; if (is_direntry_le_ih(ih)) { + /* + * bytes_or_entries = entries number in last + * item body of SOURCE + */ if (bytes_or_entries == -1) - /* bytes_or_entries = entries number in last item body of SOURCE */ bytes_or_entries = ih_entry_count(ih); leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, @@ -217,9 +239,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, return 1; } - /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; - part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; - don't create new item header + /* + * copy part of the body of the last item of SOURCE to the + * begin of the body of the first item of the DEST; part defined + * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; + * change first item key of the DEST; don't create new item header */ RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), @@ -270,15 +294,18 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, } leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, - B_I_PITEM(src, + ih_item_body(src, ih) + ih_item_len(ih) - bytes_or_entries, 0); return 1; } -/* copy cpy_mun items from buffer src to buffer dest - * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest - * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest +/* + * copy cpy_mun items from buffer src to buffer dest + * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning + * from first-th item in src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning + * from first-th item in src to head of dest */ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, @@ -311,11 +338,14 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, nr = blkh_nr_item(blkh); free_space = blkh_free_space(blkh); - /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ + /* + * we will insert items before 0-th or nr-th item in dest buffer. + * It depends of last_first parameter + */ dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; /* location of head of first new item */ - ih = B_N_PITEM_HEAD(dest, dest_before); + ih = item_head(dest, dest_before); RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, "vs-10140: not enough free space for headers %d (needed %d)", @@ -325,7 +355,7 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); /* copy item headers */ - memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE); + memcpy(ih, item_head(src, first), cpy_num * IH_SIZE); free_space -= (IH_SIZE * cpy_num); set_blkh_free_space(blkh, free_space); @@ -338,8 +368,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, } /* prepare space for items */ - last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before])); - last_inserted_loc = ih_location(&(ih[cpy_num - 1])); + last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]); + last_inserted_loc = ih_location(&ih[cpy_num - 1]); /* check free space */ RFALSE(free_space < j - last_inserted_loc, @@ -352,7 +382,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, /* copy items */ memcpy(dest->b_data + last_inserted_loc, - B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc); + item_body(src, (first + cpy_num - 1)), + j - last_inserted_loc); /* sizes, item number */ set_blkh_nr_item(blkh, nr + cpy_num); @@ -376,8 +407,10 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, } } -/* This function splits the (liquid) item into two items (useful when - shifting part of an item into another node.) */ +/* + * This function splits the (liquid) item into two items (useful when + * shifting part of an item into another node.) + */ static void leaf_item_bottle(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int item_num, int cpy_bytes) @@ -389,17 +422,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, "vs-10170: bytes == - 1 means: do not split item"); if (last_first == FIRST_TO_LAST) { - /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - ih = B_N_PITEM_HEAD(src, item_num); + /* + * if ( if item in position item_num in buffer SOURCE + * is directory item ) + */ + ih = item_head(src, item_num); if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); else { struct item_head n_ih; - /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST - part defined by 'cpy_bytes'; create new item header; change old item_header (????); - n_ih = new item_header; + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the end of the DEST part defined by + * 'cpy_bytes'; create new item header; change old + * item_header (????); n_ih = new item_header; */ memcpy(&n_ih, ih, IH_SIZE); put_ih_item_len(&n_ih, cpy_bytes); @@ -411,30 +449,36 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, set_ih_free_space(&n_ih, 0); } - RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size), + RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size), "vs-10190: bad mergeability of item %h", ih); n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, - B_N_PITEM(src, item_num), 0); + item_body(src, item_num), 0); } } else { - /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - ih = B_N_PITEM_HEAD(src, item_num); + /* + * if ( if item in position item_num in buffer + * SOURCE is directory item ) + */ + ih = item_head(src, item_num); if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, item_num, - I_ENTRY_COUNT(ih) - cpy_bytes, + ih_entry_count(ih) - cpy_bytes, cpy_bytes); else { struct item_head n_ih; - /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST - part defined by 'cpy_bytes'; create new item header; - n_ih = new item_header; + /* + * copy part of the body of the item number 'item_num' + * of SOURCE to the begin of the DEST part defined by + * 'cpy_bytes'; create new item header; + * n_ih = new item_header; */ memcpy(&n_ih, ih, SHORT_KEY_SIZE); - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + /* Endian safe, both le */ + n_ih.ih_version = ih->ih_version; if (is_direct_le_ih(ih)) { set_le_ih_k_offset(&n_ih, @@ -458,20 +502,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, /* set item length */ put_ih_item_len(&n_ih, cpy_bytes); - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + /* Endian safe, both le */ + n_ih.ih_version = ih->ih_version; leaf_insert_into_buf(dest_bi, 0, &n_ih, - B_N_PITEM(src, - item_num) + - ih_item_len(ih) - cpy_bytes, 0); + item_body(src, item_num) + + ih_item_len(ih) - cpy_bytes, 0); } } } -/* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST. - If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST. - From last item copy cpy_num bytes for regular item and cpy_num directory entries for - directory item. */ +/* + * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE + * to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole + * items from SOURCE to DEST. From last item copy cpy_num bytes for regular + * item and cpy_num directory entries for directory item. + */ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int cpy_num, int cpy_bytes) { @@ -498,22 +544,34 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, else bytes = -1; - /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ + /* + * copy the first item or it part or nothing to the end of + * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) + */ i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); cpy_num -= i; if (cpy_num == 0) return i; pos += i; if (cpy_bytes == -1) - /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ + /* + * copy first cpy_num items starting from position + * 'pos' of SOURCE to end of DEST + */ leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, pos, cpy_num); else { - /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ + /* + * copy first cpy_num-1 items starting from position + * 'pos-1' of the SOURCE to the end of the DEST + */ leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, pos, cpy_num - 1); - /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ + /* + * copy part of the item which number is + * cpy_num+pos-1 to the end of the DEST + */ leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, cpy_num + pos - 1, cpy_bytes); } @@ -525,7 +583,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, else bytes = -1; - /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ + /* + * copy the last item or it part or nothing to the + * begin of the DEST + * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); + */ i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); cpy_num -= i; @@ -534,15 +596,24 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, pos = src_nr_item - cpy_num - i; if (cpy_bytes == -1) { - /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ + /* + * starting from position 'pos' copy last cpy_num + * items of SOURCE to begin of DEST + */ leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, pos, cpy_num); } else { - /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ + /* + * copy last cpy_num-1 items starting from position + * 'pos+1' of the SOURCE to the begin of the DEST; + */ leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, pos + 1, cpy_num - 1); - /* copy part of the item which number is pos to the begin of the DEST */ + /* + * copy part of the item which number is pos to + * the begin of the DEST + */ leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes); } @@ -550,9 +621,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, return i; } -/* there are types of coping: from S[0] to L[0], from S[0] to R[0], - from R[0] to L[0]. for each of these we have to define parent and - positions of destination and source buffers */ +/* + * there are types of coping: from S[0] to L[0], from S[0] to R[0], + * from R[0] to L[0]. for each of these we have to define parent and + * positions of destination and source buffers + */ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, struct buffer_info *dest_bi, struct buffer_info *src_bi, @@ -568,7 +641,9 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, src_bi->tb = tb; src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); - src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); /* src->b_item_order */ + + /* src->b_item_order */ + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); dest_bi->tb = tb; dest_bi->bi_bh = tb->L[0]; dest_bi->bi_parent = tb->FL[0]; @@ -633,8 +708,10 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, shift_mode, src_bi->bi_bh, dest_bi->bi_bh); } -/* copy mov_num items and mov_bytes of the (mov_num-1)th item to - neighbor. Delete them from source */ +/* + * copy mov_num items and mov_bytes of the (mov_num-1)th item to + * neighbor. Delete them from source + */ int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, int mov_bytes, struct buffer_head *Snew) { @@ -657,18 +734,24 @@ int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, return ret_value; } -/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) - from S[0] to L[0] and replace the delimiting key */ +/* + * Shift shift_num items (and shift_bytes of last shifted item if + * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key + */ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) { struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); int i; - /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ + /* + * move shift_num (and shift_bytes bytes) items from S[0] + * to left neighbor L[0] + */ i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); if (shift_num) { - if (B_NR_ITEMS(S0) == 0) { /* number of items in S[0] == 0 */ + /* number of items in S[0] == 0 */ + if (B_NR_ITEMS(S0) == 0) { RFALSE(shift_bytes != -1, "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", @@ -691,10 +774,10 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); RFALSE((shift_bytes != -1 && - !(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0)) - && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) && + !(is_direntry_le_ih(item_head(S0, 0)) + && !ih_entry_count(item_head(S0, 0)))) && (!op_is_left_mergeable - (B_N_PKEY(S0, 0), S0->b_size)), + (leaf_key(S0, 0), S0->b_size)), "vs-10280: item must be mergeable"); } } @@ -704,13 +787,18 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) /* CLEANING STOPPED HERE */ -/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ +/* + * Shift shift_num (shift_bytes) items from S[0] to the right neighbor, + * and replace the delimiting key + */ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) { - // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); int ret_value; - /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ + /* + * move shift_num (and shift_bytes) items from S[0] to + * right neighbor R[0] + */ ret_value = leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); @@ -725,12 +813,16 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) static void leaf_delete_items_entirely(struct buffer_info *bi, int first, int del_num); -/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. - If not. - If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of - the first item. Part defined by del_bytes. Don't delete first item header - If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of - the last item . Part defined by del_bytes. Don't delete last item header. +/* + * If del_bytes == -1, starting from position 'first' delete del_num + * items in whole in buffer CUR. + * If not. + * If last_first == 0. Starting from position 'first' delete del_num-1 + * items in whole. Delete part of body of the first item. Part defined by + * del_bytes. Don't delete first item header + * If last_first == 1. Starting from position 'first+1' delete del_num-1 + * items in whole. Delete part of body of the last item . Part defined by + * del_bytes. Don't delete last item header. */ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, int del_num, int del_bytes) @@ -761,32 +853,43 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, leaf_delete_items_entirely(cur_bi, first, del_num); else { if (last_first == FIRST_TO_LAST) { - /* delete del_num-1 items beginning from item in position first */ + /* + * delete del_num-1 items beginning from + * item in position first + */ leaf_delete_items_entirely(cur_bi, first, del_num - 1); - /* delete the part of the first item of the bh - do not delete item header + /* + * delete the part of the first item of the bh + * do not delete item header */ leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); } else { struct item_head *ih; int len; - /* delete del_num-1 items beginning from item in position first+1 */ + /* + * delete del_num-1 items beginning from + * item in position first+1 + */ leaf_delete_items_entirely(cur_bi, first + 1, del_num - 1); - ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1); + ih = item_head(bh, B_NR_ITEMS(bh) - 1); if (is_direntry_le_ih(ih)) /* the last item is directory */ - /* len = numbers of directory entries in this item */ + /* + * len = numbers of directory entries + * in this item + */ len = ih_entry_count(ih); else /* len = body len of item */ len = ih_item_len(ih); - /* delete the part of the last item of the bh - do not delete item header + /* + * delete the part of the last item of the bh + * do not delete item header */ leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, len - del_bytes, del_bytes); @@ -820,10 +923,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before, zeros_number, ih_item_len(inserted_item_ih)); /* get item new item must be inserted before */ - ih = B_N_PITEM_HEAD(bh, before); + ih = item_head(bh, before); /* prepare space for the body of new item */ - last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size; + last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size; unmoved_loc = before ? ih_location(ih - 1) : bh->b_size; memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), @@ -846,8 +949,8 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before, /* change locations */ for (i = before; i < nr + 1; i++) { - unmoved_loc -= ih_item_len(&(ih[i - before])); - put_ih_location(&(ih[i - before]), unmoved_loc); + unmoved_loc -= ih_item_len(&ih[i - before]); + put_ih_location(&ih[i - before], unmoved_loc); } /* sizes, free space, item number */ @@ -867,8 +970,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before, } } -/* paste paste_size bytes to affected_item_num-th item. - When item is a directory, this only prepare space for new entries */ +/* + * paste paste_size bytes to affected_item_num-th item. + * When item is a directory, this only prepare space for new entries + */ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, int pos_in_item, int paste_size, const char *body, int zeros_number) @@ -902,9 +1007,9 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, #endif /* CONFIG_REISERFS_CHECK */ /* item to be appended */ - ih = B_N_PITEM_HEAD(bh, affected_item_num); + ih = item_head(bh, affected_item_num); - last_loc = ih_location(&(ih[nr - affected_item_num - 1])); + last_loc = ih_location(&ih[nr - affected_item_num - 1]); unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size; /* prepare space */ @@ -913,8 +1018,8 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, /* change locations */ for (i = affected_item_num; i < nr; i++) - put_ih_location(&(ih[i - affected_item_num]), - ih_location(&(ih[i - affected_item_num])) - + put_ih_location(&ih[i - affected_item_num], + ih_location(&ih[i - affected_item_num]) - paste_size); if (body) { @@ -957,10 +1062,12 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, } } -/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item - does not have free space, so it moves DEHs and remaining records as - necessary. Return value is size of removed part of directory item - in bytes. */ +/* + * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + * does not have free space, so it moves DEHs and remaining records as + * necessary. Return value is size of removed part of directory item + * in bytes. + */ static int leaf_cut_entries(struct buffer_head *bh, struct item_head *ih, int from, int del_count) { @@ -971,12 +1078,14 @@ static int leaf_cut_entries(struct buffer_head *bh, int cut_records_len; /* length of all removed records */ int i; - /* make sure, that item is directory and there are enough entries to - remove */ + /* + * make sure that item is directory and there are enough entries to + * remove + */ RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); - RFALSE(I_ENTRY_COUNT(ih) < from + del_count, + RFALSE(ih_entry_count(ih) < from + del_count, "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d", - I_ENTRY_COUNT(ih), from, del_count); + ih_entry_count(ih), from, del_count); if (del_count == 0) return 0; @@ -987,22 +1096,24 @@ static int leaf_cut_entries(struct buffer_head *bh, /* entry head array */ deh = B_I_DEH(bh, ih); - /* first byte of remaining entries, those are BEFORE cut entries - (prev_record) and length of all removed records (cut_records_len) */ + /* + * first byte of remaining entries, those are BEFORE cut entries + * (prev_record) and length of all removed records (cut_records_len) + */ prev_record_offset = - (from ? deh_location(&(deh[from - 1])) : ih_item_len(ih)); + (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)); cut_records_len = prev_record_offset /*from_record */ - - deh_location(&(deh[from + del_count - 1])); + deh_location(&deh[from + del_count - 1]); prev_record = item + prev_record_offset; /* adjust locations of remaining entries */ - for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--) - put_deh_location(&(deh[i]), + for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--) + put_deh_location(&deh[i], deh_location(&deh[i]) - (DEH_SIZE * del_count)); for (i = 0; i < from; i++) - put_deh_location(&(deh[i]), + put_deh_location(&deh[i], deh_location(&deh[i]) - (DEH_SIZE * del_count + cut_records_len)); @@ -1021,14 +1132,15 @@ static int leaf_cut_entries(struct buffer_head *bh, return DEH_SIZE * del_count + cut_records_len; } -/* when cut item is part of regular file - pos_in_item - first byte that must be cut - cut_size - number of bytes to be cut beginning from pos_in_item - - when cut item is part of directory - pos_in_item - number of first deleted entry - cut_size - count of deleted entries - */ +/* + * when cut item is part of regular file + * pos_in_item - first byte that must be cut + * cut_size - number of bytes to be cut beginning from pos_in_item + * + * when cut item is part of directory + * pos_in_item - number of first deleted entry + * cut_size - count of deleted entries + */ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, int pos_in_item, int cut_size) { @@ -1043,7 +1155,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, nr = blkh_nr_item(blkh); /* item head of truncated item */ - ih = B_N_PITEM_HEAD(bh, cut_item_num); + ih = item_head(bh, cut_item_num); if (is_direntry_le_ih(ih)) { /* first cut entry () */ @@ -1055,7 +1167,6 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, cut_item_num); /* change item key by key of first entry in the item */ set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); - /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */ } } else { /* item is direct or indirect */ @@ -1089,7 +1200,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, } /* location of the last item */ - last_loc = ih_location(&(ih[nr - cut_item_num - 1])); + last_loc = ih_location(&ih[nr - cut_item_num - 1]); /* location of the item, which is remaining at the same place */ unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size; @@ -1108,7 +1219,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, /* change locations */ for (i = cut_item_num; i < nr; i++) - put_ih_location(&(ih[i - cut_item_num]), + put_ih_location(&ih[i - cut_item_num], ih_location(&ih[i - cut_item_num]) + cut_size); /* size, free space */ @@ -1156,14 +1267,14 @@ static void leaf_delete_items_entirely(struct buffer_info *bi, return; } - ih = B_N_PITEM_HEAD(bh, first); + ih = item_head(bh, first); /* location of unmovable item */ j = (first == 0) ? bh->b_size : ih_location(ih - 1); /* delete items */ - last_loc = ih_location(&(ih[nr - 1 - first])); - last_removed_loc = ih_location(&(ih[del_num - 1])); + last_loc = ih_location(&ih[nr - 1 - first]); + last_removed_loc = ih_location(&ih[del_num - 1]); memmove(bh->b_data + last_loc + j - last_removed_loc, bh->b_data + last_loc, last_removed_loc - last_loc); @@ -1173,8 +1284,8 @@ static void leaf_delete_items_entirely(struct buffer_info *bi, /* change item location */ for (i = first; i < nr - del_num; i++) - put_ih_location(&(ih[i - first]), - ih_location(&(ih[i - first])) + (j - + put_ih_location(&ih[i - first], + ih_location(&ih[i - first]) + (j - last_removed_loc)); /* sizes, item number */ @@ -1195,7 +1306,10 @@ static void leaf_delete_items_entirely(struct buffer_info *bi, } } -/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ +/* + * paste new_entry_count entries (new_dehs, records) into position + * before to item_num-th item + */ void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, @@ -1213,13 +1327,16 @@ void leaf_paste_entries(struct buffer_info *bi, if (new_entry_count == 0) return; - ih = B_N_PITEM_HEAD(bh, item_num); + ih = item_head(bh, item_num); - /* make sure, that item is directory, and there are enough records in it */ + /* + * make sure, that item is directory, and there are enough + * records in it + */ RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); - RFALSE(I_ENTRY_COUNT(ih) < before, + RFALSE(ih_entry_count(ih) < before, "10230: there are no entry we paste entries before. entry_count = %d, before = %d", - I_ENTRY_COUNT(ih), before); + ih_entry_count(ih), before); /* first byte of dest item */ item = bh->b_data + ih_location(ih); @@ -1230,21 +1347,21 @@ void leaf_paste_entries(struct buffer_info *bi, /* new records will be pasted at this point */ insert_point = item + - (before ? deh_location(&(deh[before - 1])) + (before ? deh_location(&deh[before - 1]) : (ih_item_len(ih) - paste_size)); /* adjust locations of records that will be AFTER new records */ - for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--) - put_deh_location(&(deh[i]), - deh_location(&(deh[i])) + + for (i = ih_entry_count(ih) - 1; i >= before; i--) + put_deh_location(&deh[i], + deh_location(&deh[i]) + (DEH_SIZE * new_entry_count)); /* adjust locations of records that will be BEFORE new records */ for (i = 0; i < before; i++) - put_deh_location(&(deh[i]), - deh_location(&(deh[i])) + paste_size); + put_deh_location(&deh[i], + deh_location(&deh[i]) + paste_size); - old_entry_num = I_ENTRY_COUNT(ih); + old_entry_num = ih_entry_count(ih); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ @@ -1266,10 +1383,10 @@ void leaf_paste_entries(struct buffer_info *bi, /* set locations of new records */ for (i = 0; i < new_entry_count; i++) { - put_deh_location(&(deh[i]), - deh_location(&(deh[i])) + + put_deh_location(&deh[i], + deh_location(&deh[i]) + (-deh_location - (&(new_dehs[new_entry_count - 1])) + + (&new_dehs[new_entry_count - 1]) + insert_point + DEH_SIZE * new_entry_count - item)); } @@ -1277,28 +1394,26 @@ void leaf_paste_entries(struct buffer_info *bi, /* change item key if necessary (when we paste before 0-th entry */ if (!before) { set_le_ih_k_offset(ih, deh_offset(new_dehs)); -/* memcpy (&ih->ih_key.k_offset, - &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ } #ifdef CONFIG_REISERFS_CHECK { int prev, next; /* check record locations */ deh = B_I_DEH(bh, ih); - for (i = 0; i < I_ENTRY_COUNT(ih); i++) { + for (i = 0; i < ih_entry_count(ih); i++) { next = (i < - I_ENTRY_COUNT(ih) - - 1) ? deh_location(&(deh[i + 1])) : 0; - prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0; + ih_entry_count(ih) - + 1) ? deh_location(&deh[i + 1]) : 0; + prev = (i != 0) ? deh_location(&deh[i - 1]) : 0; - if (prev && prev <= deh_location(&(deh[i]))) + if (prev && prev <= deh_location(&deh[i])) reiserfs_error(sb_from_bi(bi), "vs-10240", "directory item (%h) " "corrupted (prev %a, " "cur(%d) %a)", ih, deh + i - 1, i, deh + i); - if (next && next >= deh_location(&(deh[i]))) + if (next && next >= deh_location(&deh[i])) reiserfs_error(sb_from_bi(bi), "vs-10250", "directory item (%h) " "corrupted (cur(%d) %a, " diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index e825f8b63e6b..cd11358b10c7 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -22,8 +22,10 @@ #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i); -// directory item contains array of entry headers. This performs -// binary search through that array +/* + * directory item contains array of entry headers. This performs + * binary search through that array + */ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) { struct item_head *ih = de->de_ih; @@ -31,7 +33,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) int rbound, lbound, j; lbound = 0; - rbound = I_ENTRY_COUNT(ih) - 1; + rbound = ih_entry_count(ih) - 1; for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) { @@ -43,7 +45,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) lbound = j + 1; continue; } - // this is not name found, but matched third key component + /* this is not name found, but matched third key component */ de->de_entry_num = j; return NAME_FOUND; } @@ -52,17 +54,21 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) return NAME_NOT_FOUND; } -// comment? maybe something like set de to point to what the path points to? +/* + * comment? maybe something like set de to point to what the path points to? + */ static inline void set_de_item_location(struct reiserfs_dir_entry *de, struct treepath *path) { de->de_bh = get_last_bh(path); - de->de_ih = get_ih(path); + de->de_ih = tp_item_head(path); de->de_deh = B_I_DEH(de->de_bh, de->de_ih); de->de_item_num = PATH_LAST_POSITION(path); } -// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set +/* + * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set + */ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) { struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; @@ -71,17 +77,17 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); - de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh); + de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh); if (de->de_name[de->de_namelen - 1] == 0) de->de_namelen = strlen(de->de_name); } -// what entry points to +/* what entry points to */ static inline void set_de_object_key(struct reiserfs_dir_entry *de) { BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); - de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num])); - de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num])); + de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]); + de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]); } static inline void store_de_entry_key(struct reiserfs_dir_entry *de) @@ -96,21 +102,20 @@ static inline void store_de_entry_key(struct reiserfs_dir_entry *de) le32_to_cpu(de->de_ih->ih_key.k_dir_id); de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu(de->de_ih->ih_key.k_objectid); - set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh)); - set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY); + set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh)); + set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY); } -/* We assign a key to each directory item, and place multiple entries -in a single directory item. A directory item has a key equal to the -key of the first directory entry in it. - -This function first calls search_by_key, then, if item whose first -entry matches is not found it looks for the entry inside directory -item found by search_by_key. Fills the path to the entry, and to the -entry position in the item - -*/ - +/* + * We assign a key to each directory item, and place multiple entries in a + * single directory item. A directory item has a key equal to the key of + * the first directory entry in it. + + * This function first calls search_by_key, then, if item whose first entry + * matches is not found it looks for the entry inside directory item found + * by search_by_key. Fills the path to the entry, and to the entry position + * in the item + */ /* The function is NOT SCHEDULE-SAFE! */ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, struct treepath *path, struct reiserfs_dir_entry *de) @@ -144,7 +149,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, #ifdef CONFIG_REISERFS_CHECK if (!is_direntry_le_ih(de->de_ih) || - COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) { + COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) { print_block(de->de_bh, 0, -1, -1); reiserfs_panic(sb, "vs-7005", "found item %h is not directory " "item or does not belong to the same directory " @@ -152,12 +157,17 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, } #endif /* CONFIG_REISERFS_CHECK */ - /* binary search in directory item by third componen t of the - key. sets de->de_entry_num of de */ + /* + * binary search in directory item by third component of the + * key. sets de->de_entry_num of de + */ retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); path->pos_in_item = de->de_entry_num; if (retval != NAME_NOT_FOUND) { - // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set + /* + * ugly, but rename needs de_bh, de_deh, de_name, + * de_namelen, de_objectid set + */ set_de_name_and_namelen(de); set_de_object_key(de); } @@ -166,11 +176,12 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, /* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ -/* The third component is hashed, and you can choose from more than - one hash function. Per directory hashes are not yet implemented - but are thought about. This function should be moved to hashes.c - Jedi, please do so. -Hans */ - +/* + * The third component is hashed, and you can choose from more than + * one hash function. Per directory hashes are not yet implemented + * but are thought about. This function should be moved to hashes.c + * Jedi, please do so. -Hans + */ static __u32 get_third_component(struct super_block *s, const char *name, int len) { @@ -183,11 +194,13 @@ static __u32 get_third_component(struct super_block *s, res = REISERFS_SB(s)->s_hash_function(name, len); - // take bits from 7-th to 30-th including both bounds + /* take bits from 7-th to 30-th including both bounds */ res = GET_HASH_VALUE(res); if (res == 0) - // needed to have no names before "." and ".." those have hash - // value == 0 and generation conters 1 and 2 accordingly + /* + * needed to have no names before "." and ".." those have hash + * value == 0 and generation conters 1 and 2 accordingly + */ res = 128; return res + MAX_GENERATION_NUMBER; } @@ -208,7 +221,7 @@ static int reiserfs_match(struct reiserfs_dir_entry *de, /* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ - /* used when hash collisions exist */ +/* used when hash collisions exist */ static int linear_search_in_dir_item(struct cpu_key *key, struct reiserfs_dir_entry *de, @@ -220,7 +233,7 @@ static int linear_search_in_dir_item(struct cpu_key *key, i = de->de_entry_num; - if (i == I_ENTRY_COUNT(de->de_ih) || + if (i == ih_entry_count(de->de_ih) || GET_HASH_VALUE(deh_offset(deh + i)) != GET_HASH_VALUE(cpu_key_k_offset(key))) { i--; @@ -232,43 +245,50 @@ static int linear_search_in_dir_item(struct cpu_key *key, deh += i; for (; i >= 0; i--, deh--) { + /* hash value does not match, no need to check whole name */ if (GET_HASH_VALUE(deh_offset(deh)) != GET_HASH_VALUE(cpu_key_k_offset(key))) { - // hash value does not match, no need to check whole name return NAME_NOT_FOUND; } - /* mark, that this generation number is used */ + /* mark that this generation number is used */ if (de->de_gen_number_bit_string) set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), de->de_gen_number_bit_string); - // calculate pointer to name and namelen + /* calculate pointer to name and namelen */ de->de_entry_num = i; set_de_name_and_namelen(de); + /* + * de's de_name, de_namelen, de_recordlen are set. + * Fill the rest. + */ if ((retval = reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { - // de's de_name, de_namelen, de_recordlen are set. Fill the rest: - // key of pointed object + /* key of pointed object */ set_de_object_key(de); store_de_entry_key(de); - // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE + /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */ return retval; } } if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) - /* we have reached left most entry in the node. In common we - have to go to the left neighbor, but if generation counter - is 0 already, we know for sure, that there is no name with - the same hash value */ - // FIXME: this work correctly only because hash value can not - // be 0. Btw, in case of Yura's hash it is probably possible, - // so, this is a bug + /* + * we have reached left most entry in the node. In common we + * have to go to the left neighbor, but if generation counter + * is 0 already, we know for sure, that there is no name with + * the same hash value + */ + /* + * FIXME: this work correctly only because hash value can not + * be 0. Btw, in case of Yura's hash it is probably possible, + * so, this is a bug + */ return NAME_NOT_FOUND; RFALSE(de->de_item_num, @@ -277,8 +297,10 @@ static int linear_search_in_dir_item(struct cpu_key *key, return GOTO_PREVIOUS_ITEM; } -// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND -// FIXME: should add something like IOERROR +/* + * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND + * FIXME: should add something like IOERROR + */ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, struct treepath *path_to_entry, struct reiserfs_dir_entry *de) @@ -307,13 +329,19 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, retval = linear_search_in_dir_item(&key_to_search, de, name, namelen); + /* + * there is no need to scan directory anymore. + * Given entry found or does not exist + */ if (retval != GOTO_PREVIOUS_ITEM) { - /* there is no need to scan directory anymore. Given entry found or does not exist */ path_to_entry->pos_in_item = de->de_entry_num; return retval; } - /* there is left neighboring item of this directory and given entry can be there */ + /* + * there is left neighboring item of this directory + * and given entry can be there + */ set_cpu_key_k_offset(&key_to_search, le_ih_k_offset(de->de_ih) - 1); pathrelse(path_to_entry); @@ -341,14 +369,16 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, pathrelse(&path_to_entry); if (retval == NAME_FOUND) { inode = reiserfs_iget(dir->i_sb, - (struct cpu_key *)&(de.de_dir_id)); + (struct cpu_key *)&de.de_dir_id); if (!inode || IS_ERR(inode)) { reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); } - /* Propagate the private flag so we know we're - * in the priv tree */ + /* + * Propagate the private flag so we know we're + * in the priv tree + */ if (IS_PRIVATE(dir)) inode->i_flags |= S_PRIVATE; } @@ -361,9 +391,9 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, } /* -** looks up the dentry of the parent directory for child. -** taken from ext2_get_parent -*/ + * looks up the dentry of the parent directory for child. + * taken from ext2_get_parent + */ struct dentry *reiserfs_get_parent(struct dentry *child) { int retval; @@ -384,7 +414,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child) reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-ENOENT); } - inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id); reiserfs_write_unlock(dir->i_sb); return d_obtain_alias(inode); @@ -406,8 +436,13 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, struct reiserfs_dir_entry de; DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); int gen_number; - char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc - if we create file with short name */ + + /* + * 48 bytes now and we avoid kmalloc if we + * create file with short name + */ + char small_buf[32 + DEH_SIZE]; + char *buffer; int buflen, paste_size; int retval; @@ -439,21 +474,30 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, (get_inode_sd_version(dir) == STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; - /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ + /* + * fill buffer : directory entry head, name[, dir objectid | , + * stat data | ,stat data, dir objectid ] + */ deh = (struct reiserfs_de_head *)buffer; deh->deh_location = 0; /* JDM Endian safe if 0 */ put_deh_offset(deh, cpu_key_k_offset(&entry_key)); deh->deh_state = 0; /* JDM Endian safe if 0 */ /* put key (ino analog) to de */ - deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; /* safe: k_dir_id is le */ - deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* safe: k_objectid is le */ + + /* safe: k_dir_id is le */ + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; + /* safe: k_objectid is le */ + deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* copy name */ memcpy((char *)(deh + 1), name, namelen); /* padd by 0s to the 4 byte boundary */ padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); - /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ + /* + * entry is ready to be pasted into tree, set 'visibility' + * and 'stat data in entry' attributes + */ mark_de_without_sd(deh); visible ? mark_de_visible(deh) : mark_de_hidden(deh); @@ -499,7 +543,8 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* update max-hash-collisions counter in reiserfs_sb_info */ PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); - if (gen_number != 0) { /* we need to re-search for the insertion point */ + /* we need to re-search for the insertion point */ + if (gen_number != 0) { if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) { reiserfs_warning(dir->i_sb, "vs-7032", @@ -527,18 +572,19 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, dir->i_size += paste_size; dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; if (!S_ISDIR(inode->i_mode) && visible) - // reiserfs_mkdir or reiserfs_rename will do that by itself + /* reiserfs_mkdir or reiserfs_rename will do that by itself */ reiserfs_update_sd(th, dir); reiserfs_check_path(&path); return 0; } -/* quota utility function, call if you've had to abort after calling -** new_inode_init, and have not called reiserfs_new_inode yet. -** This should only be called on inodes that do not have stat data -** inserted into the tree yet. -*/ +/* + * quota utility function, call if you've had to abort after calling + * new_inode_init, and have not called reiserfs_new_inode yet. + * This should only be called on inodes that do not have stat data + * inserted into the tree yet. + */ static int drop_new_inode(struct inode *inode) { dquot_drop(inode); @@ -548,18 +594,23 @@ static int drop_new_inode(struct inode *inode) return 0; } -/* utility function that does setup for reiserfs_new_inode. -** dquot_initialize needs lots of credits so it's better to have it -** outside of a transaction, so we had to pull some bits of -** reiserfs_new_inode out into this func. -*/ +/* + * utility function that does setup for reiserfs_new_inode. + * dquot_initialize needs lots of credits so it's better to have it + * outside of a transaction, so we had to pull some bits of + * reiserfs_new_inode out into this func. + */ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) { - /* Make inode invalid - just in case we are going to drop it before - * the initialization happens */ + /* + * Make inode invalid - just in case we are going to drop it before + * the initialization happens + */ INODE_PKEY(inode)->k_objectid = 0; - /* the quota init calls have to know who to charge the quota to, so - ** we have to set uid and gid here + + /* + * the quota init calls have to know who to charge the quota to, so + * we have to set uid and gid here */ inode_init_owner(inode, dir, mode); dquot_initialize(inode); @@ -571,7 +622,10 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod { int retval; struct inode *inode; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + @@ -618,7 +672,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod int err; drop_nlink(inode); reiserfs_update_sd(&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); @@ -630,9 +684,9 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); - out_failed: +out_failed: reiserfs_write_unlock(dir->i_sb); return retval; } @@ -644,7 +698,10 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode struct inode *inode; struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + @@ -685,7 +742,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode inode->i_op = &reiserfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); - //FIXME: needed for block and char devices only + /* FIXME: needed for block and char devices only */ reiserfs_update_sd(&th, inode); reiserfs_update_inode_transaction(inode); @@ -698,7 +755,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode int err; drop_nlink(inode); reiserfs_update_sd(&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); @@ -708,9 +765,9 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); - out_failed: +out_failed: reiserfs_write_unlock(dir->i_sb); return retval; } @@ -721,7 +778,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct inode *inode; struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas + * for new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + @@ -730,7 +790,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode dquot_initialize(dir); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ + /* + * set flag that new packing locality created and new blocks + * for the content of that directory are not displaced yet + */ REISERFS_I(dir)->new_packing_locality = 1; #endif mode = S_IFDIR | mode; @@ -754,8 +817,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out_failed; } - /* inc the link count now, so another writer doesn't overflow it while - ** we sleep later on. + /* + * inc the link count now, so another writer doesn't overflow + * it while we sleep later on. */ INC_DIR_INODE_NLINK(dir) @@ -774,7 +838,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode inode->i_op = &reiserfs_dir_inode_operations; inode->i_fop = &reiserfs_dir_operations; - // note, _this_ add_entry will not update dir's stat data + /* note, _this_ add_entry will not update dir's stat data */ retval = reiserfs_add_entry(&th, dir, dentry->d_name.name, dentry->d_name.len, inode, 1 /*visible */ ); @@ -783,19 +847,19 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode clear_nlink(inode); DEC_DIR_INODE_NLINK(dir); reiserfs_update_sd(&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); iput(inode); goto out_failed; } - // the above add_entry did not update dir's stat data + /* the above add_entry did not update dir's stat data */ reiserfs_update_sd(&th, dir); unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); out_failed: reiserfs_write_unlock(dir->i_sb); return retval; @@ -803,10 +867,11 @@ out_failed: static inline int reiserfs_empty_dir(struct inode *inode) { - /* we can cheat because an old format dir cannot have - ** EMPTY_DIR_SIZE, and a new format dir cannot have - ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, - ** regardless of disk format version, the directory is empty. + /* + * we can cheat because an old format dir cannot have + * EMPTY_DIR_SIZE, and a new format dir cannot have + * EMPTY_DIR_SIZE_V1. So, if the inode is either size, + * regardless of disk format version, the directory is empty. */ if (inode->i_size != EMPTY_DIR_SIZE && inode->i_size != EMPTY_DIR_SIZE_V1) { @@ -824,10 +889,12 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - /* we will be doing 2 balancings and update 2 stat data, we change quotas - * of the owner of the directory and of the owner of the parent directory. - * The quota structure is possibly deleted only on last iput => outside - * of this transaction */ + /* + * we will be doing 2 balancings and update 2 stat data, we + * change quotas of the owner of the directory and of the owner + * of the parent directory. The quota structure is possibly + * deleted only on last iput => outside of this transaction + */ jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); @@ -856,8 +923,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) reiserfs_update_inode_transaction(dir); if (de.de_objectid != inode->i_ino) { - // FIXME: compare key of an object and a key found in the - // entry + /* + * FIXME: compare key of an object and a key found in the entry + */ retval = -EIO; goto end_rmdir; } @@ -867,7 +935,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) } /* cut entry from dir directory */ - retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, /* page */ + retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key, + dir, NULL, /* page */ 0 /*new file size - not used here */ ); if (retval < 0) goto end_rmdir; @@ -888,18 +957,20 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) /* prevent empty directory from getting lost */ add_save_link(&th, inode, 0 /* not truncate */ ); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_check_path(&path); - out_rmdir: +out_rmdir: reiserfs_write_unlock(dir->i_sb); return retval; - end_rmdir: - /* we must release path, because we did not call - reiserfs_cut_from_item, or reiserfs_cut_from_item does not - release path if operation was not complete */ +end_rmdir: + /* + * we must release path, because we did not call + * reiserfs_cut_from_item, or reiserfs_cut_from_item does not + * release path if operation was not complete + */ pathrelse(&path); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; } @@ -918,10 +989,13 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) inode = dentry->d_inode; - /* in this transaction we can be doing at max two balancings and update - * two stat datas, we change quotas of the owner of the directory and of - * the owner of the parent directory. The quota structure is possibly - * deleted only on iput => outside of this transaction */ + /* + * in this transaction we can be doing at max two balancings and + * update two stat datas, we change quotas of the owner of the + * directory and of the owner of the parent directory. The quota + * structure is possibly deleted only on iput => outside of + * this transaction + */ jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); @@ -946,8 +1020,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) reiserfs_update_inode_transaction(dir); if (de.de_objectid != inode->i_ino) { - // FIXME: compare key of an object and a key found in the - // entry + /* + * FIXME: compare key of an object and a key found in the entry + */ retval = -EIO; goto end_unlink; } @@ -968,7 +1043,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) savelink = inode->i_nlink; retval = - reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, + reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL, 0); if (retval < 0) { inc_nlink(inode); @@ -985,18 +1060,18 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) /* prevent file from getting lost */ add_save_link(&th, inode, 0 /* not truncate */ ); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_check_path(&path); reiserfs_write_unlock(dir->i_sb); return retval; - end_unlink: +end_unlink: pathrelse(&path); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); reiserfs_check_path(&path); if (err) retval = err; - out_unlink: +out_unlink: reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1011,7 +1086,10 @@ static int reiserfs_symlink(struct inode *parent_dir, struct reiserfs_transaction_handle th; struct reiserfs_security_handle security; int mode = S_IFLNK | S_IRWXUGO; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + /* + * We need blocks for transaction + (user+group)*(quotas for + * new inode + update of quota for directory owner) + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + @@ -1070,17 +1148,13 @@ static int reiserfs_symlink(struct inode *parent_dir, inode->i_op = &reiserfs_symlink_inode_operations; inode->i_mapping->a_ops = &reiserfs_address_space_operations; - // must be sure this inode is written with this transaction - // - //reiserfs_update_sd (&th, inode, READ_BLOCKS); - retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, dentry->d_name.len, inode, 1 /*visible */ ); if (retval) { int err; drop_nlink(inode); reiserfs_update_sd(&th, inode); - err = journal_end(&th, parent_dir->i_sb, jbegin_count); + err = journal_end(&th); if (err) retval = err; unlock_new_inode(inode); @@ -1090,8 +1164,8 @@ static int reiserfs_symlink(struct inode *parent_dir, unlock_new_inode(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, parent_dir->i_sb, jbegin_count); - out_failed: + retval = journal_end(&th); +out_failed: reiserfs_write_unlock(parent_dir->i_sb); return retval; } @@ -1102,7 +1176,10 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, int retval; struct inode *inode = old_dentry->d_inode; struct reiserfs_transaction_handle th; - /* We need blocks for transaction + update of quotas for the owners of the directory */ + /* + * We need blocks for transaction + update of quotas for + * the owners of the directory + */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); @@ -1111,7 +1188,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, reiserfs_write_lock(dir->i_sb); if (inode->i_nlink >= REISERFS_LINK_MAX) { - //FIXME: sd_nlink is 32 bit for new files + /* FIXME: sd_nlink is 32 bit for new files */ reiserfs_write_unlock(dir->i_sb); return -EMLINK; } @@ -1137,7 +1214,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, if (retval) { int err; drop_nlink(inode); - err = journal_end(&th, dir->i_sb, jbegin_count); + err = journal_end(&th); reiserfs_write_unlock(dir->i_sb); return err ? err : retval; } @@ -1147,7 +1224,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_write_unlock(dir->i_sb); return retval; } @@ -1158,9 +1235,9 @@ static int de_still_valid(const char *name, int len, { struct reiserfs_dir_entry tmp = *de; - // recalculate pointer to name and name length + /* recalculate pointer to name and name length */ set_de_name_and_namelen(&tmp); - // FIXME: could check more + /* FIXME: could check more */ if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) return 0; return 1; @@ -1217,14 +1294,16 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned long savelink = 1; struct timespec ctime; - /* three balancings: (1) old name removal, (2) new name insertion - and (3) maybe "save" link insertion - stat data updates: (1) old directory, - (2) new directory and (3) maybe old object stat data (when it is - directory) and (4) maybe stat data of object to which new entry - pointed initially and (5) maybe block containing ".." of - renamed directory - quota updates: two parent directories */ + /* + * three balancings: (1) old name removal, (2) new name insertion + * and (3) maybe "save" link insertion + * stat data updates: (1) old directory, + * (2) new directory and (3) maybe old object stat data (when it is + * directory) and (4) maybe stat data of object to which new entry + * pointed initially and (5) maybe block containing ".." of + * renamed directory + * quota updates: two parent directories + */ jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); @@ -1235,8 +1314,10 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode = old_dentry->d_inode; new_dentry_inode = new_dentry->d_inode; - // make sure, that oldname still exists and points to an object we - // are going to rename + /* + * make sure that oldname still exists and points to an object we + * are going to rename + */ old_de.de_gen_number_bit_string = NULL; reiserfs_write_lock(old_dir->i_sb); retval = @@ -1256,10 +1337,11 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode_mode = old_inode->i_mode; if (S_ISDIR(old_inode_mode)) { - // make sure, that directory being renamed has correct ".." - // and that its new parent directory has not too many links - // already - + /* + * make sure that directory being renamed has correct ".." + * and that its new parent directory has not too many links + * already + */ if (new_dentry_inode) { if (!reiserfs_empty_dir(new_dentry_inode)) { reiserfs_write_unlock(old_dir->i_sb); @@ -1267,8 +1349,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - /* directory is renamed, its parent directory will be changed, - ** so find ".." entry + /* + * directory is renamed, its parent directory will be changed, + * so find ".." entry */ dot_dot_de.de_gen_number_bit_string = NULL; retval = @@ -1303,7 +1386,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, "new entry is found, new inode == 0"); } } else if (retval) { - int err = journal_end(&th, old_dir->i_sb, jbegin_count); + int err = journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return err ? err : retval; } @@ -1311,8 +1394,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, reiserfs_update_inode_transaction(old_dir); reiserfs_update_inode_transaction(new_dir); - /* this makes it so an fsync on an open fd for the old name will - ** commit the rename operation + /* + * this makes it so an fsync on an open fd for the old name will + * commit the rename operation */ reiserfs_update_inode_transaction(old_inode); @@ -1320,38 +1404,45 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, reiserfs_update_inode_transaction(new_dentry_inode); while (1) { - // look for old name using corresponding entry key (found by reiserfs_find_entry) + /* + * look for old name using corresponding entry key + * (found by reiserfs_find_entry) + */ if ((retval = search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, &old_entry_path, &old_de)) != NAME_FOUND) { pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return -EIO; } - copy_item_head(&old_entry_ih, get_ih(&old_entry_path)); + copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path)); reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); - // look for new name by reiserfs_find_entry + /* look for new name by reiserfs_find_entry */ new_de.de_gen_number_bit_string = NULL; retval = reiserfs_find_entry(new_dir, new_dentry->d_name.name, new_dentry->d_name.len, &new_entry_path, &new_de); - // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from - // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. + /* + * reiserfs_add_entry should not return IO_ERROR, + * because it is called with essentially same parameters from + * reiserfs_add_entry above, and we'll catch any i/o errors + * before we get here. + */ if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { pathrelse(&new_entry_path); pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return -EIO; } - copy_item_head(&new_entry_ih, get_ih(&new_entry_path)); + copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path)); reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); @@ -1364,28 +1455,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, pathrelse(&dot_dot_entry_path); pathrelse(&new_entry_path); pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); + journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return -EIO; } copy_item_head(&dot_dot_ih, - get_ih(&dot_dot_entry_path)); - // node containing ".." gets into transaction + tp_item_head(&dot_dot_entry_path)); + /* node containing ".." gets into transaction */ reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1); } - /* we should check seals here, not do - this stuff, yes? Then, having - gathered everything into RAM we - should lock the buffers, yes? -Hans */ - /* probably. our rename needs to hold more - ** than one path at once. The seals would - ** have to be written to deal with multi-path - ** issues -chris + /* + * we should check seals here, not do + * this stuff, yes? Then, having + * gathered everything into RAM we + * should lock the buffers, yes? -Hans + */ + /* + * probably. our rename needs to hold more + * than one path at once. The seals would + * have to be written to deal with multi-path + * issues -chris */ - /* sanity checking before doing the rename - avoid races many - ** of the above checks could have scheduled. We have to be - ** sure our items haven't been shifted by another process. + /* + * sanity checking before doing the rename - avoid races many + * of the above checks could have scheduled. We have to be + * sure our items haven't been shifted by another process. */ if (item_moved(&new_entry_ih, &new_entry_path) || !entry_points_to_object(new_dentry->d_name.name, @@ -1430,24 +1525,28 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, break; } - /* ok, all the changes can be done in one fell swoop when we - have claimed all the buffers needed. */ + /* + * ok, all the changes can be done in one fell swoop when we + * have claimed all the buffers needed. + */ mark_de_visible(new_de.de_deh + new_de.de_entry_num); set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); - journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh); + journal_mark_dirty(&th, new_de.de_bh); mark_de_hidden(old_de.de_deh + old_de.de_entry_num); - journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh); + journal_mark_dirty(&th, old_de.de_bh); ctime = CURRENT_TIME_SEC; old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; - /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of - renamed object */ + /* + * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch + * which adds ctime update of renamed object + */ old_inode->i_ctime = ctime; if (new_dentry_inode) { - // adjust link number of the victim + /* adjust link number of the victim */ if (S_ISDIR(new_dentry_inode->i_mode)) { clear_nlink(new_dentry_inode); } else { @@ -1460,25 +1559,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode_mode)) { /* adjust ".." of renamed directory */ set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); - journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh); + journal_mark_dirty(&th, dot_dot_de.de_bh); + /* + * there (in new_dir) was no directory, so it got new link + * (".." of renamed directory) + */ if (!new_dentry_inode) - /* there (in new_dir) was no directory, so it got new link - (".." of renamed directory) */ INC_DIR_INODE_NLINK(new_dir); /* old directory lost one link - ".. " of renamed directory */ DEC_DIR_INODE_NLINK(old_dir); } - // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse + /* + * looks like in 2.3.99pre3 brelse is atomic. + * so we can use pathrelse + */ pathrelse(&new_entry_path); pathrelse(&dot_dot_entry_path); - // FIXME: this reiserfs_cut_from_item's return value may screw up - // anybody, but it will panic if will not be able to find the - // entry. This needs one more clean up + /* + * FIXME: this reiserfs_cut_from_item's return value may screw up + * anybody, but it will panic if will not be able to find the + * entry. This needs one more clean up + */ if (reiserfs_cut_from_item - (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, + (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL, 0) < 0) reiserfs_error(old_dir->i_sb, "vs-7060", "couldn't not cut old name. Fsck later?"); @@ -1496,16 +1602,13 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, reiserfs_update_sd(&th, new_dentry_inode); } - retval = journal_end(&th, old_dir->i_sb, jbegin_count); + retval = journal_end(&th); reiserfs_write_unlock(old_dir->i_sb); return retval; } -/* - * directories can handle most operations... - */ +/* directories can handle most operations... */ const struct inode_operations reiserfs_dir_inode_operations = { - //&reiserfs_dir_operations, /* default_file_ops */ .create = reiserfs_create, .lookup = reiserfs_lookup, .link = reiserfs_link, diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index f732d6a5251d..99a5d5dae46a 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -7,7 +7,7 @@ #include <linux/time.h> #include "reiserfs.h" -// find where objectid map starts +/* find where objectid map starts */ #define objectid_map(s,rs) (old_format_only (s) ? \ (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ (__le32 *)((rs) + 1)) @@ -20,7 +20,7 @@ static void check_objectid_map(struct super_block *s, __le32 * map) reiserfs_panic(s, "vs-15010", "map corrupted: %lx", (long unsigned int)le32_to_cpu(map[0])); - // FIXME: add something else here + /* FIXME: add something else here */ } #else @@ -29,19 +29,21 @@ static void check_objectid_map(struct super_block *s, __le32 * map) } #endif -/* When we allocate objectids we allocate the first unused objectid. - Each sequence of objectids in use (the odd sequences) is followed - by a sequence of objectids not in use (the even sequences). We - only need to record the last objectid in each of these sequences - (both the odd and even sequences) in order to fully define the - boundaries of the sequences. A consequence of allocating the first - objectid not in use is that under most conditions this scheme is - extremely compact. The exception is immediately after a sequence - of operations which deletes a large number of objects of - non-sequential objectids, and even then it will become compact - again as soon as more objects are created. Note that many - interesting optimizations of layout could result from complicating - objectid assignment, but we have deferred making them for now. */ +/* + * When we allocate objectids we allocate the first unused objectid. + * Each sequence of objectids in use (the odd sequences) is followed + * by a sequence of objectids not in use (the even sequences). We + * only need to record the last objectid in each of these sequences + * (both the odd and even sequences) in order to fully define the + * boundaries of the sequences. A consequence of allocating the first + * objectid not in use is that under most conditions this scheme is + * extremely compact. The exception is immediately after a sequence + * of operations which deletes a large number of objects of + * non-sequential objectids, and even then it will become compact + * again as soon as more objects are created. Note that many + * interesting optimizations of layout could result from complicating + * objectid assignment, but we have deferred making them for now. + */ /* get unique object identifier */ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) @@ -64,26 +66,30 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) return 0; } - /* This incrementation allocates the first unused objectid. That - is to say, the first entry on the objectid map is the first - unused objectid, and by incrementing it we use it. See below - where we check to see if we eliminated a sequence of unused - objectids.... */ + /* + * This incrementation allocates the first unused objectid. That + * is to say, the first entry on the objectid map is the first + * unused objectid, and by incrementing it we use it. See below + * where we check to see if we eliminated a sequence of unused + * objectids.... + */ map[1] = cpu_to_le32(unused_objectid + 1); - /* Now we check to see if we eliminated the last remaining member of - the first even sequence (and can eliminate the sequence by - eliminating its last objectid from oids), and can collapse the - first two odd sequences into one sequence. If so, then the net - result is to eliminate a pair of objectids from oids. We do this - by shifting the entire map to the left. */ + /* + * Now we check to see if we eliminated the last remaining member of + * the first even sequence (and can eliminate the sequence by + * eliminating its last objectid from oids), and can collapse the + * first two odd sequences into one sequence. If so, then the net + * result is to eliminate a pair of objectids from oids. We do this + * by shifting the entire map to the left. + */ if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { memmove(map + 1, map + 3, (sb_oid_cursize(rs) - 3) * sizeof(__u32)); set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); } - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); return unused_objectid; } @@ -97,30 +103,33 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, int i = 0; BUG_ON(!th->t_trans_id); - //return; + /*return; */ check_objectid_map(s, map); reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); - - /* start at the beginning of the objectid map (i = 0) and go to - the end of it (i = disk_sb->s_oid_cursize). Linear search is - what we use, though it is possible that binary search would be - more efficient after performing lots of deletions (which is - when oids is large.) We only check even i's. */ + journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); + + /* + * start at the beginning of the objectid map (i = 0) and go to + * the end of it (i = disk_sb->s_oid_cursize). Linear search is + * what we use, though it is possible that binary search would be + * more efficient after performing lots of deletions (which is + * when oids is large.) We only check even i's. + */ while (i < sb_oid_cursize(rs)) { if (objectid_to_release == le32_to_cpu(map[i])) { /* This incrementation unallocates the objectid. */ - //map[i]++; le32_add_cpu(&map[i], 1); - /* Did we unallocate the last member of an odd sequence, and can shrink oids? */ + /* + * Did we unallocate the last member of an + * odd sequence, and can shrink oids? + */ if (map[i] == map[i + 1]) { /* shrink objectid map */ memmove(map + i, map + i + 2, (sb_oid_cursize(rs) - i - 2) * sizeof(__u32)); - //disk_sb->s_oid_cursize -= 2; set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); RFALSE(sb_oid_cursize(rs) < 2 || @@ -135,14 +144,19 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, objectid_to_release < le32_to_cpu(map[i + 1])) { /* size of objectid map is not changed */ if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { - //objectid_map[i+1]--; le32_add_cpu(&map[i + 1], -1); return; } - /* JDM comparing two little-endian values for equality -- safe */ + /* + * JDM comparing two little-endian values for + * equality -- safe + */ + /* + * objectid map must be expanded, but + * there is no space + */ if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { - /* objectid map must be expanded, but there is no space */ PROC_INFO_INC(s, leaked_oid); return; } @@ -178,8 +192,9 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) new_objectid_map = (__le32 *) (disk_sb + 1); if (cur_size > new_size) { - /* mark everyone used that was listed as free at the end of the objectid - ** map + /* + * mark everyone used that was listed as free at + * the end of the objectid map */ objectid_map[new_size - 1] = objectid_map[cur_size - 1]; set_sb_oid_cursize(disk_sb, new_size); diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 54944d5a4a6e..c9b47e91baf8 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -172,18 +172,19 @@ static char *is_there_reiserfs_struct(char *fmt, int *what) return k; } -/* debugging reiserfs we used to print out a lot of different - variables, like keys, item headers, buffer heads etc. Values of - most fields matter. So it took a long time just to write - appropriative printk. With this reiserfs_warning you can use format - specification for complex structures like you used to do with - printfs for integers, doubles and pointers. For instance, to print - out key structure you have to write just: - reiserfs_warning ("bad key %k", key); - instead of - printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, - key->k_offset, key->k_uniqueness); -*/ +/* + * debugging reiserfs we used to print out a lot of different + * variables, like keys, item headers, buffer heads etc. Values of + * most fields matter. So it took a long time just to write + * appropriative printk. With this reiserfs_warning you can use format + * specification for complex structures like you used to do with + * printfs for integers, doubles and pointers. For instance, to print + * out key structure you have to write just: + * reiserfs_warning ("bad key %k", key); + * instead of + * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + * key->k_offset, key->k_uniqueness); + */ static DEFINE_SPINLOCK(error_lock); static void prepare_error_buf(const char *fmt, va_list args) { @@ -243,15 +244,16 @@ static void prepare_error_buf(const char *fmt, va_list args) } -/* in addition to usual conversion specifiers this accepts reiserfs - specific conversion specifiers: - %k to print little endian key, - %K to print cpu key, - %h to print item_head, - %t to print directory entry - %z to print block head (arg must be struct buffer_head * - %b to print buffer_head -*/ +/* + * in addition to usual conversion specifiers this accepts reiserfs + * specific conversion specifiers: + * %k to print little endian key, + * %K to print cpu key, + * %h to print item_head, + * %t to print directory entry + * %z to print block head (arg must be struct buffer_head * + * %b to print buffer_head + */ #define do_reiserfs_warning(fmt)\ {\ @@ -304,50 +306,52 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) #endif } -/* The format: - - maintainer-errorid: [function-name:] message - - where errorid is unique to the maintainer and function-name is - optional, is recommended, so that anyone can easily find the bug - with a simple grep for the short to type string - maintainer-errorid. Don't bother with reusing errorids, there are - lots of numbers out there. - - Example: - - reiserfs_panic( - p_sb, "reiser-29: reiserfs_new_blocknrs: " - "one of search_start or rn(%d) is equal to MAX_B_NUM," - "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", - rn, bh - ); - - Regular panic()s sometimes clear the screen before the message can - be read, thus the need for the while loop. - - Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it - pointless complexity): - - panics in reiserfs.h have numbers from 1000 to 1999 - super.c 2000 to 2999 - preserve.c (unused) 3000 to 3999 - bitmap.c 4000 to 4999 - stree.c 5000 to 5999 - prints.c 6000 to 6999 - namei.c 7000 to 7999 - fix_nodes.c 8000 to 8999 - dir.c 9000 to 9999 - lbalance.c 10000 to 10999 - ibalance.c 11000 to 11999 not ready - do_balan.c 12000 to 12999 - inode.c 13000 to 13999 - file.c 14000 to 14999 - objectid.c 15000 - 15999 - buffer.c 16000 - 16999 - symlink.c 17000 - 17999 - - . */ +/* + * The format: + * + * maintainer-errorid: [function-name:] message + * + * where errorid is unique to the maintainer and function-name is + * optional, is recommended, so that anyone can easily find the bug + * with a simple grep for the short to type string + * maintainer-errorid. Don't bother with reusing errorids, there are + * lots of numbers out there. + * + * Example: + * + * reiserfs_panic( + * p_sb, "reiser-29: reiserfs_new_blocknrs: " + * "one of search_start or rn(%d) is equal to MAX_B_NUM," + * "which means that we are optimizing location based on the " + * "bogus location of a temp buffer (%p).", + * rn, bh + * ); + * + * Regular panic()s sometimes clear the screen before the message can + * be read, thus the need for the while loop. + * + * Numbering scheme for panic used by Vladimir and Anatoly( Hans completely + * ignores this scheme, and considers it pointless complexity): + * + * panics in reiserfs_fs.h have numbers from 1000 to 1999 + * super.c 2000 to 2999 + * preserve.c (unused) 3000 to 3999 + * bitmap.c 4000 to 4999 + * stree.c 5000 to 5999 + * prints.c 6000 to 6999 + * namei.c 7000 to 7999 + * fix_nodes.c 8000 to 8999 + * dir.c 9000 to 9999 + * lbalance.c 10000 to 10999 + * ibalance.c 11000 to 11999 not ready + * do_balan.c 12000 to 12999 + * inode.c 13000 to 13999 + * file.c 14000 to 14999 + * objectid.c 15000 - 15999 + * buffer.c 16000 - 16999 + * symlink.c 17000 - 17999 + * + * . */ void __reiserfs_panic(struct super_block *sb, const char *id, const char *function, const char *fmt, ...) @@ -411,9 +415,11 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) reiserfs_abort_journal(sb, errno); } -/* this prints internal nodes (4 keys/items in line) (dc_number, - dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, - dc_size)...*/ +/* + * this prints internal nodes (4 keys/items in line) (dc_number, + * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + * dc_size)... + */ static int print_internal(struct buffer_head *bh, int first, int last) { struct reiserfs_key *key; @@ -439,7 +445,7 @@ static int print_internal(struct buffer_head *bh, int first, int last) dc = B_N_CHILD(bh, from); reiserfs_printk("PTR %d: %y ", from, dc); - for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to; + for (i = from, key = internal_key(bh, from), dc++; i < to; i++, key++, dc++) { reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); if (i && i % 4 == 0) @@ -463,7 +469,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first, check_leaf(bh); blkh = B_BLK_HEAD(bh); - ih = B_N_PITEM_HEAD(bh, 0); + ih = item_head(bh, 0); nr = blkh_nr_item(blkh); printk @@ -496,7 +502,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first, ("-------------------------------------------------------------------------------\n"); reiserfs_printk("|%2d| %h |\n", i, ih); if (print_mode & PRINT_LEAF_ITEMS) - op_print_item(ih, B_I_PITEM(bh, ih)); + op_print_item(ih, ih_item_body(bh, ih)); } printk @@ -543,9 +549,11 @@ static int print_super_block(struct buffer_head *bh) printk("Block count %u\n", sb_block_count(rs)); printk("Blocksize %d\n", sb_blocksize(rs)); printk("Free blocks %u\n", sb_free_blocks(rs)); - // FIXME: this would be confusing if - // someone stores reiserfs super block in some data block ;) + /* + * FIXME: this would be confusing if + * someone stores reiserfs super block in some data block ;) // skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); + */ skipped = bh->b_blocknr; data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + @@ -581,8 +589,8 @@ static int print_desc_block(struct buffer_head *bh) return 0; } - -void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int last) +/* ..., int print_mode, int first, int last) */ +void print_block(struct buffer_head *bh, ...) { va_list args; int mode, first, last; @@ -644,11 +652,11 @@ void store_print_tb(struct tree_balance *tb) "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", h, (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), - (tbSh) ? atomic_read(&(tbSh->b_count)) : -1, + (tbSh) ? atomic_read(&tbSh->b_count) : -1, (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL), - (tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1, + (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1, (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), - (tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1, + (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1, (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), (tb->FL[h]) ? (long long)(tb->FL[h]-> b_blocknr) : (-1LL), @@ -665,9 +673,9 @@ void store_print_tb(struct tree_balance *tb) "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], - tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes, - tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], - tb->rkey[0]); + tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0], + tb->sbytes[0], tb->snum[1], tb->sbytes[1], + tb->cur_blknum, tb->lkey[0], tb->rkey[0]); /* this prints balance parameters for non-leaf levels */ h = 0; @@ -690,7 +698,7 @@ void store_print_tb(struct tree_balance *tb) "%p (%llu %d)%s", tb->FEB[i], tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> b_blocknr : 0ULL, - tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0, + tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0, (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", "); sprintf(print_tb_buf + strlen(print_tb_buf), @@ -744,8 +752,8 @@ void check_leaf(struct buffer_head *bh) if (!bh) return; check_leaf_block_head(bh); - for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) - op_check_item(ih, B_I_PITEM(bh, ih)); + for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) + op_check_item(ih, ih_item_body(bh, ih)); } void check_internal(struct buffer_head *bh) diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 83d4eac8059a..bf53888c7f59 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1,5 +1,6 @@ /* - * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for + * licensing and copyright details */ #include <linux/reiserfs_fs.h> @@ -23,52 +24,73 @@ struct reiserfs_journal_list; -/** bitmasks for i_flags field in reiserfs-specific part of inode */ +/* bitmasks for i_flags field in reiserfs-specific part of inode */ typedef enum { - /** this says what format of key do all items (but stat data) of - an object have. If this is set, that format is 3.6 otherwise - - 3.5 */ + /* + * this says what format of key do all items (but stat data) of + * an object have. If this is set, that format is 3.6 otherwise - 3.5 + */ i_item_key_version_mask = 0x0001, - /** If this is unset, object has 3.5 stat data, otherwise, it has - 3.6 stat data with 64bit size, 32bit nlink etc. */ + + /* + * If this is unset, object has 3.5 stat data, otherwise, + * it has 3.6 stat data with 64bit size, 32bit nlink etc. + */ i_stat_data_version_mask = 0x0002, - /** file might need tail packing on close */ + + /* file might need tail packing on close */ i_pack_on_close_mask = 0x0004, - /** don't pack tail of file */ + + /* don't pack tail of file */ i_nopack_mask = 0x0008, - /** If those is set, "safe link" was created for this file during - truncate or unlink. Safe link is used to avoid leakage of disk - space on crash with some files open, but unlinked. */ + + /* + * If either of these are set, "safe link" was created for this + * file during truncate or unlink. Safe link is used to avoid + * leakage of disk space on crash with some files open, but unlinked. + */ i_link_saved_unlink_mask = 0x0010, i_link_saved_truncate_mask = 0x0020, + i_has_xattr_dir = 0x0040, i_data_log = 0x0080, } reiserfs_inode_flags; struct reiserfs_inode_info { __u32 i_key[4]; /* key is still 4 32 bit integers */ - /** transient inode flags that are never stored on disk. Bitmasks - for this field are defined above. */ + + /* + * transient inode flags that are never stored on disk. Bitmasks + * for this field are defined above. + */ __u32 i_flags; - __u32 i_first_direct_byte; // offset of first byte stored in direct item. + /* offset of first byte stored in direct item. */ + __u32 i_first_direct_byte; /* copy of persistent inode flags read from sd_attrs. */ __u32 i_attrs; - int i_prealloc_block; /* first unused block of a sequence of unused blocks */ + /* first unused block of a sequence of unused blocks */ + int i_prealloc_block; int i_prealloc_count; /* length of that sequence */ - struct list_head i_prealloc_list; /* per-transaction list of inodes which - * have preallocated blocks */ - unsigned new_packing_locality:1; /* new_packig_locality is created; new blocks - * for the contents of this directory should be - * displaced */ + /* per-transaction list of inodes which have preallocated blocks */ + struct list_head i_prealloc_list; - /* we use these for fsync or O_SYNC to decide which transaction - ** needs to be committed in order for this inode to be properly - ** flushed */ + /* + * new_packing_locality is created; new blocks for the contents + * of this directory should be displaced + */ + unsigned new_packing_locality:1; + + /* + * we use these for fsync or O_SYNC to decide which transaction + * needs to be committed in order for this inode to be properly + * flushed + */ unsigned int i_trans_id; + struct reiserfs_journal_list *i_jl; atomic_t openers; struct mutex tailpack; @@ -82,9 +104,10 @@ typedef enum { reiserfs_attrs_cleared = 0x00000001, } reiserfs_super_block_flags; -/* struct reiserfs_super_block accessors/mutators - * since this is a disk structure, it will always be in - * little endian format. */ +/* + * struct reiserfs_super_block accessors/mutators since this is a disk + * structure, it will always be in little endian format. + */ #define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count)) #define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v)) #define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks)) @@ -152,48 +175,61 @@ typedef enum { /* LOGGING -- */ -/* These all interelate for performance. -** -** If the journal block count is smaller than n transactions, you lose speed. -** I don't know what n is yet, I'm guessing 8-16. -** -** typical transaction size depends on the application, how often fsync is -** called, and how many metadata blocks you dirty in a 30 second period. -** The more small files (<16k) you use, the larger your transactions will -** be. -** -** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal -** to wrap, which slows things down. If you need high speed meta data updates, the journal should be big enough -** to prevent wrapping before dirty meta blocks get to disk. -** -** If the batch max is smaller than the transaction max, you'll waste space at the end of the journal -** because journal_end sets the next transaction to start at 0 if the next transaction has any chance of wrapping. -** -** The large the batch max age, the better the speed, and the more meta data changes you'll lose after a crash. -** -*/ +/* + * These all interelate for performance. + * + * If the journal block count is smaller than n transactions, you lose speed. + * I don't know what n is yet, I'm guessing 8-16. + * + * typical transaction size depends on the application, how often fsync is + * called, and how many metadata blocks you dirty in a 30 second period. + * The more small files (<16k) you use, the larger your transactions will + * be. + * + * If your journal fills faster than dirty buffers get flushed to disk, it + * must flush them before allowing the journal to wrap, which slows things + * down. If you need high speed meta data updates, the journal should be + * big enough to prevent wrapping before dirty meta blocks get to disk. + * + * If the batch max is smaller than the transaction max, you'll waste space + * at the end of the journal because journal_end sets the next transaction + * to start at 0 if the next transaction has any chance of wrapping. + * + * The large the batch max age, the better the speed, and the more meta + * data changes you'll lose after a crash. + */ /* don't mess with these for a while */ - /* we have a node size define somewhere in reiserfs_fs.h. -Hans */ +/* we have a node size define somewhere in reiserfs_fs.h. -Hans */ #define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */ #define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */ #define JOURNAL_HASH_SIZE 8192 -#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */ - -/* One of these for every block in every transaction -** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a -** hash of all the in memory transactions. -** next and prev are used by the current transaction (journal_hash). -** hnext and hprev are used by journal_list_hash. If a block is in more than one transaction, the journal_list_hash -** links it in multiple times. This allows flush_journal_list to remove just the cnode belonging -** to a given transaction. -*/ + +/* number of copies of the bitmaps to have floating. Must be >= 2 */ +#define JOURNAL_NUM_BITMAPS 5 + +/* + * One of these for every block in every transaction + * Each one is in two hash tables. First, a hash of the current transaction, + * and after journal_end, a hash of all the in memory transactions. + * next and prev are used by the current transaction (journal_hash). + * hnext and hprev are used by journal_list_hash. If a block is in more + * than one transaction, the journal_list_hash links it in multiple times. + * This allows flush_journal_list to remove just the cnode belonging to a + * given transaction. + */ struct reiserfs_journal_cnode { struct buffer_head *bh; /* real buffer head */ struct super_block *sb; /* dev of real buffer head */ - __u32 blocknr; /* block number of real buffer head, == 0 when buffer on disk */ + + /* block number of real buffer head, == 0 when buffer on disk */ + __u32 blocknr; + unsigned long state; - struct reiserfs_journal_list *jlist; /* journal list this cnode lives in */ + + /* journal list this cnode lives in */ + struct reiserfs_journal_list *jlist; + struct reiserfs_journal_cnode *next; /* next in transaction list */ struct reiserfs_journal_cnode *prev; /* prev in transaction list */ struct reiserfs_journal_cnode *hprev; /* prev in hash list */ @@ -212,18 +248,22 @@ struct reiserfs_list_bitmap { }; /* -** one of these for each transaction. The most important part here is the j_realblock. -** this list of cnodes is used to hash all the blocks in all the commits, to mark all the -** real buffer heads dirty once all the commits hit the disk, -** and to make sure every real block in a transaction is on disk before allowing the log area -** to be overwritten */ + * one of these for each transaction. The most important part here is the + * j_realblock. this list of cnodes is used to hash all the blocks in all + * the commits, to mark all the real buffer heads dirty once all the commits + * hit the disk, and to make sure every real block in a transaction is on + * disk before allowing the log area to be overwritten + */ struct reiserfs_journal_list { unsigned long j_start; unsigned long j_state; unsigned long j_len; atomic_t j_nonzerolen; atomic_t j_commit_left; - atomic_t j_older_commits_done; /* all commits older than this on disk */ + + /* all commits older than this on disk */ + atomic_t j_older_commits_done; + struct mutex j_commit_mutex; unsigned int j_trans_id; time_t j_timestamp; @@ -234,11 +274,15 @@ struct reiserfs_journal_list { /* time ordered list of all active transactions */ struct list_head j_list; - /* time ordered list of all transactions we haven't tried to flush yet */ + /* + * time ordered list of all transactions we haven't tried + * to flush yet + */ struct list_head j_working_list; /* list of tail conversion targets in need of flush before commit */ struct list_head j_tail_bh_list; + /* list of data=ordered buffers in need of flush before commit */ struct list_head j_bh_list; int j_refcount; @@ -246,46 +290,83 @@ struct reiserfs_journal_list { struct reiserfs_journal { struct buffer_head **j_ap_blocks; /* journal blocks on disk */ - struct reiserfs_journal_cnode *j_last; /* newest journal block */ - struct reiserfs_journal_cnode *j_first; /* oldest journal block. start here for traverse */ + /* newest journal block */ + struct reiserfs_journal_cnode *j_last; + + /* oldest journal block. start here for traverse */ + struct reiserfs_journal_cnode *j_first; struct block_device *j_dev_bd; fmode_t j_dev_mode; - int j_1st_reserved_block; /* first block on s_dev of reserved area journal */ + + /* first block on s_dev of reserved area journal */ + int j_1st_reserved_block; unsigned long j_state; unsigned int j_trans_id; unsigned long j_mount_id; - unsigned long j_start; /* start of current waiting commit (index into j_ap_blocks) */ + + /* start of current waiting commit (index into j_ap_blocks) */ + unsigned long j_start; unsigned long j_len; /* length of current waiting commit */ - unsigned long j_len_alloc; /* number of buffers requested by journal_begin() */ + + /* number of buffers requested by journal_begin() */ + unsigned long j_len_alloc; + atomic_t j_wcount; /* count of writers for current commit */ - unsigned long j_bcount; /* batch count. allows turning X transactions into 1 */ - unsigned long j_first_unflushed_offset; /* first unflushed transactions offset */ - unsigned j_last_flush_trans_id; /* last fully flushed journal timestamp */ + + /* batch count. allows turning X transactions into 1 */ + unsigned long j_bcount; + + /* first unflushed transactions offset */ + unsigned long j_first_unflushed_offset; + + /* last fully flushed journal timestamp */ + unsigned j_last_flush_trans_id; + struct buffer_head *j_header_bh; time_t j_trans_start_time; /* time this transaction started */ struct mutex j_mutex; struct mutex j_flush_mutex; - wait_queue_head_t j_join_wait; /* wait for current transaction to finish before starting new one */ - atomic_t j_jlock; /* lock for j_join_wait */ + + /* wait for current transaction to finish before starting new one */ + wait_queue_head_t j_join_wait; + + atomic_t j_jlock; /* lock for j_join_wait */ int j_list_bitmap_index; /* number of next list bitmap to use */ - int j_must_wait; /* no more journal begins allowed. MUST sleep on j_join_wait */ - int j_next_full_flush; /* next journal_end will flush all journal list */ - int j_next_async_flush; /* next journal_end will flush all async commits */ + + /* no more journal begins allowed. MUST sleep on j_join_wait */ + int j_must_wait; + + /* next journal_end will flush all journal list */ + int j_next_full_flush; + + /* next journal_end will flush all async commits */ + int j_next_async_flush; int j_cnode_used; /* number of cnodes on the used list */ int j_cnode_free; /* number of cnodes on the free list */ - unsigned int j_trans_max; /* max number of blocks in a transaction. */ - unsigned int j_max_batch; /* max number of blocks to batch into a trans */ - unsigned int j_max_commit_age; /* in seconds, how old can an async commit be */ - unsigned int j_max_trans_age; /* in seconds, how old can a transaction be */ - unsigned int j_default_max_commit_age; /* the default for the max commit age */ + /* max number of blocks in a transaction. */ + unsigned int j_trans_max; + + /* max number of blocks to batch into a trans */ + unsigned int j_max_batch; + + /* in seconds, how old can an async commit be */ + unsigned int j_max_commit_age; + + /* in seconds, how old can a transaction be */ + unsigned int j_max_trans_age; + + /* the default for the max commit age */ + unsigned int j_default_max_commit_age; struct reiserfs_journal_cnode *j_cnode_free_list; - struct reiserfs_journal_cnode *j_cnode_free_orig; /* orig pointer returned from vmalloc */ + + /* orig pointer returned from vmalloc */ + struct reiserfs_journal_cnode *j_cnode_free_orig; struct reiserfs_journal_list *j_current_jl; int j_free_bitmap_nodes; @@ -306,14 +387,21 @@ struct reiserfs_journal { /* list of all active transactions */ struct list_head j_journal_list; + /* lists that haven't been touched by writeback attempts */ struct list_head j_working_list; - struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; /* array of bitmaps to record the deleted blocks */ - struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; /* hash table for real buffer heads in current trans */ - struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; /* hash table for all the real buffer heads in all - the transactions */ - struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */ + /* hash table for real buffer heads in current trans */ + struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; + + /* hash table for all the real buffer heads in all the transactions */ + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; + + /* array of bitmaps to record the deleted blocks */ + struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; + + /* list of inodes which have preallocated blocks */ + struct list_head j_prealloc_list; int j_persistent_trans; unsigned long j_max_trans_size; unsigned long j_max_batch_size; @@ -328,11 +416,12 @@ struct reiserfs_journal { enum journal_state_bits { J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */ - J_WRITERS_QUEUED, /* set when log is full due to too many writers */ - J_ABORTED, /* set when log is aborted */ + J_WRITERS_QUEUED, /* set when log is full due to too many writers */ + J_ABORTED, /* set when log is aborted */ }; -#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */ +/* ick. magic string to find desc blocks in the journal */ +#define JOURNAL_DESC_MAGIC "ReIsErLB" typedef __u32(*hashf_t) (const signed char *, int); @@ -364,7 +453,10 @@ typedef struct reiserfs_proc_info_data { stat_cnt_t leaked_oid; stat_cnt_t leaves_removable; - /* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. */ + /* + * balances per level. + * Use explicit 5 as MAX_HEIGHT is not visible yet. + */ stat_cnt_t balance_at[5]; /* XXX */ /* sbk == search_by_key */ stat_cnt_t sbk_read_at[5]; /* XXX */ @@ -416,47 +508,75 @@ typedef struct reiserfs_proc_info_data { /* reiserfs union of in-core super block data */ struct reiserfs_sb_info { - struct buffer_head *s_sbh; /* Buffer containing the super block */ - /* both the comment and the choice of - name are unclear for s_rs -Hans */ - struct reiserfs_super_block *s_rs; /* Pointer to the super block in the buffer */ + /* Buffer containing the super block */ + struct buffer_head *s_sbh; + + /* Pointer to the on-disk super block in the buffer */ + struct reiserfs_super_block *s_rs; struct reiserfs_bitmap_info *s_ap_bitmap; - struct reiserfs_journal *s_journal; /* pointer to journal information */ + + /* pointer to journal information */ + struct reiserfs_journal *s_journal; + unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ /* Serialize writers access, replace the old bkl */ struct mutex lock; + /* Owner of the lock (can be recursive) */ struct task_struct *lock_owner; + /* Depth of the lock, start from -1 like the bkl */ int lock_depth; + struct workqueue_struct *commit_wq; + /* Comment? -Hans */ void (*end_io_handler) (struct buffer_head *, int); - hashf_t s_hash_function; /* pointer to function which is used - to sort names in directory. Set on - mount */ - unsigned long s_mount_opt; /* reiserfs's mount options are set - here (currently - NOTAIL, NOLOG, - REPLAYONLY) */ - - struct { /* This is a structure that describes block allocator options */ - unsigned long bits; /* Bitfield for enable/disable kind of options */ - unsigned long large_file_size; /* size started from which we consider file to be a large one(in blocks) */ + + /* + * pointer to function which is used to sort names in directory. + * Set on mount + */ + hashf_t s_hash_function; + + /* reiserfs's mount options are set here */ + unsigned long s_mount_opt; + + /* This is a structure that describes block allocator options */ + struct { + /* Bitfield for enable/disable kind of options */ + unsigned long bits; + + /* + * size started from which we consider file + * to be a large one (in blocks) + */ + unsigned long large_file_size; + int border; /* percentage of disk, border takes */ - int preallocmin; /* Minimal file size (in blocks) starting from which we do preallocations */ - int preallocsize; /* Number of blocks we try to prealloc when file - reaches preallocmin size (in blocks) or - prealloc_list is empty. */ + + /* + * Minimal file size (in blocks) starting + * from which we do preallocations + */ + int preallocmin; + + /* + * Number of blocks we try to prealloc when file + * reaches preallocmin size (in blocks) or prealloc_list + is empty. + */ + int preallocsize; } s_alloc_options; /* Comment? -Hans */ wait_queue_head_t s_wait; - /* To be obsoleted soon by per buffer seals.. -Hans */ - atomic_t s_generation_counter; // increased by one every time the - // tree gets re-balanced - unsigned long s_properties; /* File system properties. Currently holds - on-disk FS format */ + /* increased by one every time the tree gets re-balanced */ + atomic_t s_generation_counter; + + /* File system properties. Currently holds on-disk FS format */ + unsigned long s_properties; /* session statistics */ int s_disk_reads; @@ -469,14 +589,23 @@ struct reiserfs_sb_info { int s_bmaps_without_search; int s_direct2indirect; int s_indirect2direct; - /* set up when it's ok for reiserfs_read_inode2() to read from - disk inode with nlink==0. Currently this is only used during - finish_unfinished() processing at mount time */ + + /* + * set up when it's ok for reiserfs_read_inode2() to read from + * disk inode with nlink==0. Currently this is only used during + * finish_unfinished() processing at mount time + */ int s_is_unlinked_ok; + reiserfs_proc_info_data_t s_proc_info_data; struct proc_dir_entry *procdir; - int reserved_blocks; /* amount of blocks reserved for further allocations */ - spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */ + + /* amount of blocks reserved for further allocations */ + int reserved_blocks; + + + /* this lock on now only used to protect reserved_blocks variable */ + spinlock_t bitmap_lock; struct dentry *priv_root; /* root of /.reiserfs_priv */ struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */ int j_errno; @@ -492,14 +621,13 @@ struct reiserfs_sb_info { char *s_jdev; /* Stored jdev for mount option showing */ #ifdef CONFIG_REISERFS_CHECK - struct tree_balance *cur_tb; /* - * Detects whether more than one - * copy of tb exists per superblock - * as a means of checking whether - * do_balance is executing concurrently - * against another tree reader/writer - * on a same mount point. - */ + /* + * Detects whether more than one copy of tb exists per superblock + * as a means of checking whether do_balance is executing + * concurrently against another tree reader/writer on a same + * mount point. + */ + struct tree_balance *cur_tb; #endif }; @@ -508,25 +636,36 @@ struct reiserfs_sb_info { #define REISERFS_3_6 1 #define REISERFS_OLD_FORMAT 2 -enum reiserfs_mount_options { /* Mount options */ - REISERFS_LARGETAIL, /* large tails will be created in a session */ - REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */ - REPLAYONLY, /* replay journal and return 0. Use by fsck */ - REISERFS_CONVERT, /* -o conv: causes conversion of old - format super block to the new - format. If not specified - old - partition will be dealt with in a - manner of 3.5.x */ - -/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting -** reiserfs disks from 3.5.19 or earlier. 99% of the time, this option -** is not required. If the normal autodection code can't determine which -** hash to use (because both hashes had the same value for a file) -** use this option to force a specific hash. It won't allow you to override -** the existing hash on the FS, so if you have a tea hash disk, and mount -** with -o hash=rupasov, the mount will fail. -*/ +enum reiserfs_mount_options { + /* large tails will be created in a session */ + REISERFS_LARGETAIL, + /* + * small (for files less than block size) tails will + * be created in a session + */ + REISERFS_SMALLTAIL, + + /* replay journal and return 0. Use by fsck */ + REPLAYONLY, + + /* + * -o conv: causes conversion of old format super block to the + * new format. If not specified - old partition will be dealt + * with in a manner of 3.5.x + */ + REISERFS_CONVERT, + + /* + * -o hash={tea, rupasov, r5, detect} is meant for properly mounting + * reiserfs disks from 3.5.19 or earlier. 99% of the time, this + * option is not required. If the normal autodection code can't + * determine which hash to use (because both hashes had the same + * value for a file) use this option to force a specific hash. + * It won't allow you to override the existing hash on the FS, so + * if you have a tea hash disk, and mount with -o hash=rupasov, + * the mount will fail. + */ FORCE_TEA_HASH, /* try to force tea hash on mount */ FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */ FORCE_R5_HASH, /* try to force rupasov hash on mount */ @@ -536,9 +675,11 @@ enum reiserfs_mount_options { REISERFS_DATA_ORDERED, REISERFS_DATA_WRITEBACK, -/* used for testing experimental features, makes benchmarking new - features with and without more convenient, should never be used by - users in any code shipped to users (ideally) */ + /* + * used for testing experimental features, makes benchmarking new + * features with and without more convenient, should never be used by + * users in any code shipped to users (ideally) + */ REISERFS_NO_BORDER, REISERFS_NO_UNHASHED_RELOCATION, @@ -705,28 +846,28 @@ static inline void reiserfs_cond_resched(struct super_block *s) struct fid; -/* in reading the #defines, it may help to understand that they employ - the following abbreviations: - - B = Buffer - I = Item header - H = Height within the tree (should be changed to LEV) - N = Number of the item in the node - STAT = stat data - DEH = Directory Entry Header - EC = Entry Count - E = Entry number - UL = Unsigned Long - BLKH = BLocK Header - UNFM = UNForMatted node - DC = Disk Child - P = Path - - These #defines are named by concatenating these abbreviations, - where first comes the arguments, and last comes the return value, - of the macro. - -*/ +/* + * in reading the #defines, it may help to understand that they employ + * the following abbreviations: + * + * B = Buffer + * I = Item header + * H = Height within the tree (should be changed to LEV) + * N = Number of the item in the node + * STAT = stat data + * DEH = Directory Entry Header + * EC = Entry Count + * E = Entry number + * UL = Unsigned Long + * BLKH = BLocK Header + * UNFM = UNForMatted node + * DC = Disk Child + * P = Path + * + * These #defines are named by concatenating these abbreviations, + * where first comes the arguments, and last comes the return value, + * of the macro. + */ #define USE_INODE_GENERATION_COUNTER @@ -737,14 +878,17 @@ struct fid; /* n must be power of 2 */ #define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u)) -// to be ok for alpha and others we have to align structures to 8 byte -// boundary. -// FIXME: do not change 4 by anything else: there is code which relies on that +/* + * to be ok for alpha and others we have to align structures to 8 byte + * boundary. + * FIXME: do not change 4 by anything else: there is code which relies on that + */ #define ROUND_UP(x) _ROUND_UP(x,8LL) -/* debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug -** messages. -*/ +/* + * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug + * messages. + */ #define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ void __reiserfs_warning(struct super_block *s, const char *id, @@ -753,7 +897,7 @@ void __reiserfs_warning(struct super_block *s, const char *id, __reiserfs_warning(s, id, __func__, fmt, ##args) /* assertions handling */ -/** always check a condition and panic if it's false. */ +/* always check a condition and panic if it's false. */ #define __RASSERT(cond, scond, format, args...) \ do { \ if (!(cond)) \ @@ -776,35 +920,48 @@ do { \ * Disk Data Structures */ -/***************************************************************************/ -/* SUPER BLOCK */ -/***************************************************************************/ +/*************************************************************************** + * SUPER BLOCK * + ***************************************************************************/ /* - * Structure of super block on disk, a version of which in RAM is often accessed as REISERFS_SB(s)->s_rs - * the version in RAM is part of a larger structure containing fields never written to disk. + * Structure of super block on disk, a version of which in RAM is often + * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger + * structure containing fields never written to disk. */ -#define UNSET_HASH 0 // read_super will guess about, what hash names - // in directories were sorted with +#define UNSET_HASH 0 /* Detect hash on disk */ #define TEA_HASH 1 #define YURA_HASH 2 #define R5_HASH 3 #define DEFAULT_HASH R5_HASH struct journal_params { - __le32 jp_journal_1st_block; /* where does journal start from on its - * device */ - __le32 jp_journal_dev; /* journal device st_rdev */ - __le32 jp_journal_size; /* size of the journal */ - __le32 jp_journal_trans_max; /* max number of blocks in a transaction. */ - __le32 jp_journal_magic; /* random value made on fs creation (this - * was sb_journal_block_count) */ - __le32 jp_journal_max_batch; /* max number of blocks to batch into a - * trans */ - __le32 jp_journal_max_commit_age; /* in seconds, how old can an async - * commit be */ - __le32 jp_journal_max_trans_age; /* in seconds, how old can a transaction - * be */ + /* where does journal start from on its * device */ + __le32 jp_journal_1st_block; + + /* journal device st_rdev */ + __le32 jp_journal_dev; + + /* size of the journal */ + __le32 jp_journal_size; + + /* max number of blocks in a transaction. */ + __le32 jp_journal_trans_max; + + /* + * random value made on fs creation + * (this was sb_journal_block_count) + */ + __le32 jp_journal_magic; + + /* max number of blocks to batch into a trans */ + __le32 jp_journal_max_batch; + + /* in seconds, how old can an async commit be */ + __le32 jp_journal_max_commit_age; + + /* in seconds, how old can a transaction be */ + __le32 jp_journal_max_trans_age; }; /* this is the super from 3.5.X, where X >= 10 */ @@ -814,26 +971,48 @@ struct reiserfs_super_block_v1 { __le32 s_root_block; /* root block number */ struct journal_params s_journal; __le16 s_blocksize; /* block size */ - __le16 s_oid_maxsize; /* max size of object id array, see - * get_objectid() commentary */ + + /* max size of object id array, see get_objectid() commentary */ + __le16 s_oid_maxsize; __le16 s_oid_cursize; /* current size of object id array */ - __le16 s_umount_state; /* this is set to 1 when filesystem was - * umounted, to 2 - when not */ - char s_magic[10]; /* reiserfs magic string indicates that - * file system is reiserfs: - * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */ - __le16 s_fs_state; /* it is set to used by fsck to mark which - * phase of rebuilding is done */ - __le32 s_hash_function_code; /* indicate, what hash function is being use - * to sort names in a directory*/ + + /* this is set to 1 when filesystem was umounted, to 2 - when not */ + __le16 s_umount_state; + + /* + * reiserfs magic string indicates that file system is reiserfs: + * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" + */ + char s_magic[10]; + + /* + * it is set to used by fsck to mark which + * phase of rebuilding is done + */ + __le16 s_fs_state; + /* + * indicate, what hash function is being use + * to sort names in a directory + */ + __le32 s_hash_function_code; __le16 s_tree_height; /* height of disk tree */ - __le16 s_bmap_nr; /* amount of bitmap blocks needed to address - * each block of file system */ - __le16 s_version; /* this field is only reliable on filesystem - * with non-standard journal */ - __le16 s_reserved_for_journal; /* size in blocks of journal area on main - * device, we need to keep after - * making fs with non-standard journal */ + + /* + * amount of bitmap blocks needed to address + * each block of file system + */ + __le16 s_bmap_nr; + + /* + * this field is only reliable on filesystem with non-standard journal + */ + __le16 s_version; + + /* + * size in blocks of journal area on main device, we need to + * keep after making fs with non-standard journal + */ + __le16 s_reserved_for_journal; } __attribute__ ((__packed__)); #define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1)) @@ -842,17 +1021,21 @@ struct reiserfs_super_block_v1 { struct reiserfs_super_block { struct reiserfs_super_block_v1 s_v1; __le32 s_inode_generation; - __le32 s_flags; /* Right now used only by inode-attributes, if enabled */ + + /* Right now used only by inode-attributes, if enabled */ + __le32 s_flags; + unsigned char s_uuid[16]; /* filesystem unique identifier */ unsigned char s_label[16]; /* filesystem volume label */ __le16 s_mnt_count; /* Count of mounts since last fsck */ __le16 s_max_mnt_count; /* Maximum mounts before check */ __le32 s_lastcheck; /* Timestamp of last fsck */ __le32 s_check_interval; /* Interval between checks */ - char s_unused[76]; /* zero filled by mkreiserfs and - * reiserfs_convert_objectid_map_v1() - * so any additions must be updated - * there as well. */ + + /* + * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1() + * so any additions must be updated there as well. */ + char s_unused[76]; } __attribute__ ((__packed__)); #define SB_SIZE (sizeof(struct reiserfs_super_block)) @@ -860,7 +1043,7 @@ struct reiserfs_super_block { #define REISERFS_VERSION_1 0 #define REISERFS_VERSION_2 2 -// on-disk super block fields converted to cpu form +/* on-disk super block fields converted to cpu form */ #define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs) #define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1)) #define SB_BLOCKSIZE(s) \ @@ -915,11 +1098,13 @@ int is_reiserfs_3_5(struct reiserfs_super_block *rs); int is_reiserfs_3_6(struct reiserfs_super_block *rs); int is_reiserfs_jr(struct reiserfs_super_block *rs); -/* ReiserFS leaves the first 64k unused, so that partition labels have - enough space. If someone wants to write a fancy bootloader that - needs more than 64k, let us know, and this will be increased in size. - This number must be larger than than the largest block size on any - platform, or code will break. -Hans */ +/* + * ReiserFS leaves the first 64k unused, so that partition labels have + * enough space. If someone wants to write a fancy bootloader that + * needs more than 64k, let us know, and this will be increased in size. + * This number must be larger than than the largest block size on any + * platform, or code will break. -Hans + */ #define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024) #define REISERFS_FIRST_BLOCK unused_define #define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES @@ -944,8 +1129,7 @@ struct unfm_nodeinfo { unsigned short unfm_freespace; }; -/* there are two formats of keys: 3.5 and 3.6 - */ +/* there are two formats of keys: 3.5 and 3.6 */ #define KEY_FORMAT_3_5 0 #define KEY_FORMAT_3_6 1 @@ -963,8 +1147,10 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb) return sb->s_fs_info; } -/* Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 - * which overflows on large file systems. */ +/* + * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16 + * which overflows on large file systems. + */ static inline __u32 reiserfs_bmap_count(struct super_block *sb) { return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1; @@ -975,8 +1161,10 @@ static inline int bmap_would_wrap(unsigned bmap_nr) return bmap_nr > ((1LL << 16) - 1); } -/** this says about version of key of all items (but stat data) the - object consists of */ +/* + * this says about version of key of all items (but stat data) the + * object consists of + */ #define get_inode_item_key_version( inode ) \ ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5) @@ -995,16 +1183,18 @@ static inline int bmap_would_wrap(unsigned bmap_nr) else \ REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; }) -/* This is an aggressive tail suppression policy, I am hoping it - improves our benchmarks. The principle behind it is that percentage - space saving is what matters, not absolute space saving. This is - non-intuitive, but it helps to understand it if you consider that the - cost to access 4 blocks is not much more than the cost to access 1 - block, if you have to do a seek and rotate. A tail risks a - non-linear disk access that is significant as a percentage of total - time cost for a 4 block file and saves an amount of space that is - less significant as a percentage of space, or so goes the hypothesis. - -Hans */ +/* + * This is an aggressive tail suppression policy, I am hoping it + * improves our benchmarks. The principle behind it is that percentage + * space saving is what matters, not absolute space saving. This is + * non-intuitive, but it helps to understand it if you consider that the + * cost to access 4 blocks is not much more than the cost to access 1 + * block, if you have to do a seek and rotate. A tail risks a + * non-linear disk access that is significant as a percentage of total + * time cost for a 4 block file and saves an amount of space that is + * less significant as a percentage of space, or so goes the hypothesis. + * -Hans + */ #define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \ (\ (!(n_tail_size)) || \ @@ -1018,10 +1208,11 @@ static inline int bmap_would_wrap(unsigned bmap_nr) ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \ ) -/* Another strategy for tails, this one means only create a tail if all the - file would fit into one DIRECT item. - Primary intention for this one is to increase performance by decreasing - seeking. +/* + * Another strategy for tails, this one means only create a tail if all the + * file would fit into one DIRECT item. + * Primary intention for this one is to increase performance by decreasing + * seeking. */ #define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \ (\ @@ -1035,23 +1226,21 @@ static inline int bmap_would_wrap(unsigned bmap_nr) #define REISERFS_VALID_FS 1 #define REISERFS_ERROR_FS 2 -// -// there are 5 item types currently -// +/* + * there are 5 item types currently + */ #define TYPE_STAT_DATA 0 #define TYPE_INDIRECT 1 #define TYPE_DIRECT 2 #define TYPE_DIRENTRY 3 #define TYPE_MAXTYPE 3 -#define TYPE_ANY 15 // FIXME: comment is required +#define TYPE_ANY 15 /* FIXME: comment is required */ -/***************************************************************************/ -/* KEY & ITEM HEAD */ -/***************************************************************************/ +/*************************************************************************** + * KEY & ITEM HEAD * + ***************************************************************************/ -// -// directories use this key as well as old files -// +/* * directories use this key as well as old files */ struct offset_v1 { __le32 k_offset; __le32 k_uniqueness; @@ -1084,11 +1273,14 @@ static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset) v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset); } -/* Key of an item determines its location in the S+tree, and - is composed of 4 components */ +/* + * Key of an item determines its location in the S+tree, and + * is composed of 4 components + */ struct reiserfs_key { - __le32 k_dir_id; /* packing locality: by default parent - directory object id */ + /* packing locality: by default parent directory object id */ + __le32 k_dir_id; + __le32 k_objectid; /* object identifier */ union { struct offset_v1 k_offset_v1; @@ -1097,8 +1289,8 @@ struct reiserfs_key { } __attribute__ ((__packed__)); struct in_core_key { - __u32 k_dir_id; /* packing locality: by default parent - directory object id */ + /* packing locality: by default parent directory object id */ + __u32 k_dir_id; __u32 k_objectid; /* object identifier */ __u64 k_offset; __u8 k_type; @@ -1107,14 +1299,16 @@ struct in_core_key { struct cpu_key { struct in_core_key on_disk_key; int version; - int key_length; /* 3 in all cases but direct2indirect and - indirect2direct conversion */ + /* 3 in all cases but direct2indirect and indirect2direct conversion */ + int key_length; }; -/* Our function for comparing keys can compare keys of different - lengths. It takes as a parameter the length of the keys it is to - compare. These defines are used in determining what is to be passed - to it as that parameter. */ +/* + * Our function for comparing keys can compare keys of different + * lengths. It takes as a parameter the length of the keys it is to + * compare. These defines are used in determining what is to be passed + * to it as that parameter. + */ #define REISERFS_FULL_KEY_LEN 4 #define REISERFS_SHORT_KEY_LEN 2 @@ -1143,40 +1337,52 @@ struct cpu_key { #define POSITION_FOUND 1 #define POSITION_NOT_FOUND 0 -// return values for reiserfs_find_entry and search_by_entry_key +/* return values for reiserfs_find_entry and search_by_entry_key */ #define NAME_FOUND 1 #define NAME_NOT_FOUND 0 #define GOTO_PREVIOUS_ITEM 2 #define NAME_FOUND_INVISIBLE 3 -/* Everything in the filesystem is stored as a set of items. The - item head contains the key of the item, its free space (for - indirect items) and specifies the location of the item itself - within the block. */ +/* + * Everything in the filesystem is stored as a set of items. The + * item head contains the key of the item, its free space (for + * indirect items) and specifies the location of the item itself + * within the block. + */ struct item_head { - /* Everything in the tree is found by searching for it based on - * its key.*/ + /* + * Everything in the tree is found by searching for it based on + * its key. + */ struct reiserfs_key ih_key; union { - /* The free space in the last unformatted node of an - indirect item if this is an indirect item. This - equals 0xFFFF iff this is a direct item or stat data - item. Note that the key, not this field, is used to - determine the item type, and thus which field this - union contains. */ + /* + * The free space in the last unformatted node of an + * indirect item if this is an indirect item. This + * equals 0xFFFF iff this is a direct item or stat data + * item. Note that the key, not this field, is used to + * determine the item type, and thus which field this + * union contains. + */ __le16 ih_free_space_reserved; - /* Iff this is a directory item, this field equals the - number of directory entries in the directory item. */ + + /* + * Iff this is a directory item, this field equals the + * number of directory entries in the directory item. + */ __le16 ih_entry_count; } __attribute__ ((__packed__)) u; __le16 ih_item_len; /* total size of the item body */ - __le16 ih_item_location; /* an offset to the item body - * within the block */ - __le16 ih_version; /* 0 for all old items, 2 for new - ones. Highest bit is set by fsck - temporary, cleaned after all - done */ + + /* an offset to the item body within the block */ + __le16 ih_item_location; + + /* + * 0 for all old items, 2 for new ones. Highest bit is set by fsck + * temporary, cleaned after all done + */ + __le16 ih_version; } __attribute__ ((__packed__)); /* size of item header */ #define IH_SIZE (sizeof(struct item_head)) @@ -1198,27 +1404,24 @@ struct item_head { #define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih)) #define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val))) -/* these operate on indirect items, where you've got an array of ints -** at a possibly unaligned location. These are a noop on ia32 -** -** p is the array of __u32, i is the index into the array, v is the value -** to store there. -*/ +/* + * these operate on indirect items, where you've got an array of ints + * at a possibly unaligned location. These are a noop on ia32 + * + * p is the array of __u32, i is the index into the array, v is the value + * to store there. + */ #define get_block_num(p, i) get_unaligned_le32((p) + (i)) #define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i)) -// -// in old version uniqueness field shows key type -// +/* * in old version uniqueness field shows key type */ #define V1_SD_UNIQUENESS 0 #define V1_INDIRECT_UNIQUENESS 0xfffffffe #define V1_DIRECT_UNIQUENESS 0xffffffff #define V1_DIRENTRY_UNIQUENESS 500 -#define V1_ANY_UNIQUENESS 555 // FIXME: comment is required +#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */ -// -// here are conversion routines -// +/* here are conversion routines */ static inline int uniqueness2type(__u32 uniqueness) CONSTF; static inline int uniqueness2type(__u32 uniqueness) { @@ -1255,11 +1458,11 @@ static inline __u32 type2uniqueness(int type) } } -// -// key is pointer to on disk key which is stored in le, result is cpu, -// there is no way to get version of object from key, so, provide -// version to these defines -// +/* + * key is pointer to on disk key which is stored in le, result is cpu, + * there is no way to get version of object from key, so, provide + * version to these defines + */ static inline loff_t le_key_k_offset(int version, const struct reiserfs_key *key) { @@ -1275,9 +1478,11 @@ static inline loff_t le_ih_k_offset(const struct item_head *ih) static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key) { - return (version == KEY_FORMAT_3_5) ? - uniqueness2type(le32_to_cpu(key->u.k_offset_v1.k_uniqueness)) : - offset_v2_k_type(&(key->u.k_offset_v2)); + if (version == KEY_FORMAT_3_5) { + loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness); + return uniqueness2type(val); + } else + return offset_v2_k_type(&(key->u.k_offset_v2)); } static inline loff_t le_ih_k_type(const struct item_head *ih) @@ -1288,8 +1493,22 @@ static inline loff_t le_ih_k_type(const struct item_head *ih) static inline void set_le_key_k_offset(int version, struct reiserfs_key *key, loff_t offset) { - (version == KEY_FORMAT_3_5) ? (void)(key->u.k_offset_v1.k_offset = cpu_to_le32(offset)) : /* jdm check */ - (void)(set_offset_v2_k_offset(&(key->u.k_offset_v2), offset)); + if (version == KEY_FORMAT_3_5) + key->u.k_offset_v1.k_offset = cpu_to_le32(offset); + else + set_offset_v2_k_offset(&key->u.k_offset_v2, offset); +} + +static inline void add_le_key_k_offset(int version, struct reiserfs_key *key, + loff_t offset) +{ + set_le_key_k_offset(version, key, + le_key_k_offset(version, key) + offset); +} + +static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset) +{ + add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset); } static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) @@ -1300,10 +1519,11 @@ static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset) static inline void set_le_key_k_type(int version, struct reiserfs_key *key, int type) { - (version == KEY_FORMAT_3_5) ? - (void)(key->u.k_offset_v1.k_uniqueness = - cpu_to_le32(type2uniqueness(type))) - : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type)); + if (version == KEY_FORMAT_3_5) { + type = type2uniqueness(type); + key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type); + } else + set_offset_v2_k_type(&key->u.k_offset_v2, type); } static inline void set_le_ih_k_type(struct item_head *ih, int type) @@ -1331,9 +1551,7 @@ static inline int is_statdata_le_key(int version, struct reiserfs_key *key) return le_key_k_type(version, key) == TYPE_STAT_DATA; } -// -// item header has version. -// +/* item header has version. */ static inline int is_direntry_le_ih(struct item_head *ih) { return is_direntry_le_key(ih_version(ih), &ih->ih_key); @@ -1354,9 +1572,7 @@ static inline int is_statdata_le_ih(struct item_head *ih) return is_statdata_le_key(ih_version(ih), &ih->ih_key); } -// -// key is pointer to cpu key, result is cpu -// +/* key is pointer to cpu key, result is cpu */ static inline loff_t cpu_key_k_offset(const struct cpu_key *key) { return key->on_disk_key.k_offset; @@ -1407,7 +1623,7 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key) extern struct reiserfs_key root_key; -/* +/* * Picture represents a leaf of the S+tree * ______________________________________________________ * | | Array of | | | @@ -1416,15 +1632,19 @@ extern struct reiserfs_key root_key; * |______|_______________|___________________|___________| */ -/* Header of a disk block. More precisely, header of a formatted leaf - or internal node, and not the header of an unformatted node. */ +/* + * Header of a disk block. More precisely, header of a formatted leaf + * or internal node, and not the header of an unformatted node. + */ struct block_head { __le16 blk_level; /* Level of a block in the tree. */ __le16 blk_nr_item; /* Number of keys/items in a block. */ __le16 blk_free_space; /* Block free space in bytes. */ __le16 blk_reserved; /* dump this in v4/planA */ - struct reiserfs_key blk_right_delim_key; /* kept only for compatibility */ + + /* kept only for compatibility */ + struct reiserfs_key blk_right_delim_key; }; #define BLKH_SIZE (sizeof(struct block_head)) @@ -1439,18 +1659,20 @@ struct block_head { #define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key) #define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val) +/* values for blk_level field of the struct block_head */ + /* - * values for blk_level field of the struct block_head + * When node gets removed from the tree its blk_level is set to FREE_LEVEL. + * It is then used to see whether the node is still in the tree */ - -#define FREE_LEVEL 0 /* when node gets removed from the tree its - blk_level is set to FREE_LEVEL. It is then - used to see whether the node is still in the - tree */ +#define FREE_LEVEL 0 #define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */ -/* Given the buffer head of a formatted node, resolve to the block head of that node. */ +/* + * Given the buffer head of a formatted node, resolve to the + * block head of that node. + */ #define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data)) /* Number of items that are in buffer. */ #define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh))) @@ -1471,14 +1693,14 @@ struct block_head { #define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \ && B_LEVEL(bh) <= MAX_HEIGHT) -/***************************************************************************/ -/* STAT DATA */ -/***************************************************************************/ +/*************************************************************************** + * STAT DATA * + ***************************************************************************/ -// -// old stat data is 32 bytes long. We are going to distinguish new one by -// different size -// +/* + * old stat data is 32 bytes long. We are going to distinguish new one by + * different size +*/ struct stat_data_v1 { __le16 sd_mode; /* file type, permissions */ __le16 sd_nlink; /* number of hard links */ @@ -1487,20 +1709,25 @@ struct stat_data_v1 { __le32 sd_size; /* file size */ __le32 sd_atime; /* time of last access */ __le32 sd_mtime; /* time file was last modified */ - __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; union { __le32 sd_rdev; __le32 sd_blocks; /* number of blocks file uses */ } __attribute__ ((__packed__)) u; - __le32 sd_first_direct_byte; /* first byte of file which is stored - in a direct item: except that if it - equals 1 it is a symlink and if it - equals ~(__u32)0 there is no - direct item. The existence of this - field really grates on me. Let's - replace it with a macro based on - sd_size and our tail suppression - policy. Someday. -Hans */ + + /* + * first byte of file which is stored in a direct item: except that if + * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no + * direct item. The existence of this field really grates on me. + * Let's replace it with a macro based on sd_size and our tail + * suppression policy. Someday. -Hans + */ + __le32 sd_first_direct_byte; } __attribute__ ((__packed__)); #define SD_V1_SIZE (sizeof(struct stat_data_v1)) @@ -1532,8 +1759,10 @@ struct stat_data_v1 { /* inode flags stored in sd_attrs (nee sd_reserved) */ -/* we want common flags to have the same values as in ext2, - so chattr(1) will work without problems */ +/* + * we want common flags to have the same values as in ext2, + * so chattr(1) will work without problems + */ #define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL #define REISERFS_APPEND_FL FS_APPEND_FL #define REISERFS_SYNC_FL FS_SYNC_FL @@ -1553,8 +1782,10 @@ struct stat_data_v1 { REISERFS_COMPR_FL | \ REISERFS_NOTAIL_FL ) -/* Stat Data on disk (reiserfs version of UFS disk inode minus the - address blocks) */ +/* + * Stat Data on disk (reiserfs version of UFS disk inode minus the + * address blocks) + */ struct stat_data { __le16 sd_mode; /* file type, permissions */ __le16 sd_attrs; /* persistent inode flags */ @@ -1564,25 +1795,20 @@ struct stat_data { __le32 sd_gid; /* group */ __le32 sd_atime; /* time of last access */ __le32 sd_mtime; /* time file was last modified */ - __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ + + /* + * time inode (stat data) was last changed + * (except changes to sd_atime and sd_mtime) + */ + __le32 sd_ctime; __le32 sd_blocks; union { __le32 sd_rdev; __le32 sd_generation; - //__le32 sd_first_direct_byte; - /* first byte of file which is stored in a - direct item: except that if it equals 1 - it is a symlink and if it equals - ~(__u32)0 there is no direct item. The - existence of this field really grates - on me. Let's replace it with a macro - based on sd_size and our tail - suppression policy? */ } __attribute__ ((__packed__)) u; } __attribute__ ((__packed__)); -// -// this is 44 bytes long -// + +/* this is 44 bytes long */ #define SD_SIZE (sizeof(struct stat_data)) #define SD_V2_SIZE SD_SIZE #define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6) @@ -1613,48 +1839,61 @@ struct stat_data { #define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs)) #define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v)) -/***************************************************************************/ -/* DIRECTORY STRUCTURE */ -/***************************************************************************/ -/* - Picture represents the structure of directory items - ________________________________________________ - | Array of | | | | | | - | directory |N-1| N-2 | .... | 1st |0th| - | entry headers | | | | | | - |_______________|___|_____|________|_______|___| - <---- directory entries ------> - - First directory item has k_offset component 1. We store "." and ".." - in one item, always, we never split "." and ".." into differing - items. This makes, among other things, the code for removing - directories simpler. */ +/*************************************************************************** + * DIRECTORY STRUCTURE * + ***************************************************************************/ +/* + * Picture represents the structure of directory items + * ________________________________________________ + * | Array of | | | | | | + * | directory |N-1| N-2 | .... | 1st |0th| + * | entry headers | | | | | | + * |_______________|___|_____|________|_______|___| + * <---- directory entries ------> + * + * First directory item has k_offset component 1. We store "." and ".." + * in one item, always, we never split "." and ".." into differing + * items. This makes, among other things, the code for removing + * directories simpler. + */ #define SD_OFFSET 0 #define SD_UNIQUENESS 0 #define DOT_OFFSET 1 #define DOT_DOT_OFFSET 2 #define DIRENTRY_UNIQUENESS 500 -/* */ #define FIRST_ITEM_OFFSET 1 /* - Q: How to get key of object pointed to by entry from entry? - - A: Each directory entry has its header. This header has deh_dir_id and deh_objectid fields, those are key - of object, entry points to */ + * Q: How to get key of object pointed to by entry from entry? + * + * A: Each directory entry has its header. This header has deh_dir_id + * and deh_objectid fields, those are key of object, entry points to + */ -/* NOT IMPLEMENTED: - Directory will someday contain stat data of object */ +/* + * NOT IMPLEMENTED: + * Directory will someday contain stat data of object + */ struct reiserfs_de_head { __le32 deh_offset; /* third component of the directory entry key */ - __le32 deh_dir_id; /* objectid of the parent directory of the object, that is referenced - by directory entry */ - __le32 deh_objectid; /* objectid of the object, that is referenced by directory entry */ + + /* + * objectid of the parent directory of the object, that is referenced + * by directory entry + */ + __le32 deh_dir_id; + + /* objectid of the object, that is referenced by directory entry */ + __le32 deh_objectid; __le16 deh_location; /* offset of name in the whole item */ - __le16 deh_state; /* whether 1) entry contains stat data (for future), and 2) whether - entry is hidden (unlinked) */ + + /* + * whether 1) entry contains stat data (for future), and + * 2) whether entry is hidden (unlinked) + */ + __le16 deh_state; } __attribute__ ((__packed__)); #define DEH_SIZE sizeof(struct reiserfs_de_head) #define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset)) @@ -1684,9 +1923,11 @@ struct reiserfs_de_head { # define ADDR_UNALIGNED_BITS (3) #endif -/* These are only used to manipulate deh_state. +/* + * These are only used to manipulate deh_state. * Because of this, we'll use the ext2_ bit routines, - * since they are little endian */ + * since they are little endian + */ #ifdef ADDR_UNALIGNED_BITS # define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1))) @@ -1721,46 +1962,16 @@ extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, __le32 par_dirid, __le32 par_objid); -/* array of the entry headers */ - /* get item body */ -#define B_I_PITEM(bh,ih) ( (bh)->b_data + ih_location(ih) ) -#define B_I_DEH(bh,ih) ((struct reiserfs_de_head *)(B_I_PITEM(bh,ih))) - -/* length of the directory entry in directory item. This define - calculates length of i-th directory entry using directory entry - locations from dir entry head. When it calculates length of 0-th - directory entry, it uses length of whole item in place of entry - location of the non-existent following entry in the calculation. - See picture above.*/ -/* -#define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \ -((i) ? (deh_location((deh)-1) - deh_location((deh))) : (ih_item_len((ih)) - deh_location((deh)))) -*/ -static inline int entry_length(const struct buffer_head *bh, - const struct item_head *ih, int pos_in_item) -{ - struct reiserfs_de_head *deh; - - deh = B_I_DEH(bh, ih) + pos_in_item; - if (pos_in_item) - return deh_location(deh - 1) - deh_location(deh); - - return ih_item_len(ih) - deh_location(deh); -} - -/* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */ -#define I_ENTRY_COUNT(ih) (ih_entry_count((ih))) - -/* name by bh, ih and entry_num */ -#define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih_location(ih) + deh_location(B_I_DEH(bh,ih)+(entry_num)))) - -// two entries per block (at least) +/* two entries per block (at least) */ #define REISERFS_MAX_NAME(block_size) 255 -/* this structure is used for operations on directory entries. It is - not a disk structure. */ -/* When reiserfs_find_entry or search_by_entry_key find directory - entry, they return filled reiserfs_dir_entry structure */ +/* + * this structure is used for operations on directory entries. It is + * not a disk structure. + * + * When reiserfs_find_entry or search_by_entry_key find directory + * entry, they return filled reiserfs_dir_entry structure + */ struct reiserfs_dir_entry { struct buffer_head *de_bh; int de_item_num; @@ -1778,10 +1989,14 @@ struct reiserfs_dir_entry { struct cpu_key de_entry_key; }; -/* these defines are useful when a particular member of a reiserfs_dir_entry is needed */ +/* + * these defines are useful when a particular member of + * a reiserfs_dir_entry is needed + */ /* pointer to file name, stored in entry */ -#define B_I_DEH_ENTRY_FILE_NAME(bh,ih,deh) (B_I_PITEM (bh, ih) + deh_location(deh)) +#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \ + (ih_item_body(bh, ih) + deh_location(deh)) /* length of name */ #define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \ @@ -1804,11 +2019,13 @@ struct reiserfs_dir_entry { * |______|_______________|___________________|___________| */ -/***************************************************************************/ -/* DISK CHILD */ -/***************************************************************************/ -/* Disk child pointer: The pointer from an internal node of the tree - to a node that is on disk. */ +/*************************************************************************** + * DISK CHILD * + ***************************************************************************/ +/* + * Disk child pointer: + * The pointer from an internal node of the tree to a node that is on disk. + */ struct disk_child { __le32 dc_block_number; /* Disk child's block number. */ __le16 dc_size; /* Disk child's used space. */ @@ -1841,47 +2058,66 @@ struct disk_child { #define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) ) #define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2) -/***************************************************************************/ -/* PATH STRUCTURES AND DEFINES */ -/***************************************************************************/ +/*************************************************************************** + * PATH STRUCTURES AND DEFINES * + ***************************************************************************/ -/* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the - key. It uses reiserfs_bread to try to find buffers in the cache given their block number. If it - does not find them in the cache it reads them from disk. For each node search_by_key finds using - reiserfs_bread it then uses bin_search to look through that node. bin_search will find the - position of the block_number of the next node if it is looking through an internal node. If it - is looking through a leaf node bin_search will find the position of the item which has key either - equal to given key, or which is the maximal key less than the given key. */ +/* + * search_by_key fills up the path from the root to the leaf as it descends + * the tree looking for the key. It uses reiserfs_bread to try to find + * buffers in the cache given their block number. If it does not find + * them in the cache it reads them from disk. For each node search_by_key + * finds using reiserfs_bread it then uses bin_search to look through that + * node. bin_search will find the position of the block_number of the next + * node if it is looking through an internal node. If it is looking through + * a leaf node bin_search will find the position of the item which has key + * either equal to given key, or which is the maximal key less than the + * given key. + */ struct path_element { - struct buffer_head *pe_buffer; /* Pointer to the buffer at the path in the tree. */ - int pe_position; /* Position in the tree node which is placed in the */ - /* buffer above. */ + /* Pointer to the buffer at the path in the tree. */ + struct buffer_head *pe_buffer; + /* Position in the tree node which is placed in the buffer above. */ + int pe_position; }; -#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */ -#define EXTENDED_MAX_HEIGHT 7 /* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ -#define FIRST_PATH_ELEMENT_OFFSET 2 /* Must be equal to at least 2. */ - -#define ILLEGAL_PATH_ELEMENT_OFFSET 1 /* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ -#define MAX_FEB_SIZE 6 /* this MUST be MAX_HEIGHT + 1. See about FEB below */ - -/* We need to keep track of who the ancestors of nodes are. When we - perform a search we record which nodes were visited while - descending the tree looking for the node we searched for. This list - of nodes is called the path. This information is used while - performing balancing. Note that this path information may become - invalid, and this means we must check it when using it to see if it - is still valid. You'll need to read search_by_key and the comments - in it, especially about decrement_counters_in_path(), to understand - this structure. - -Paths make the code so much harder to work with and debug.... An -enormous number of bugs are due to them, and trying to write or modify -code that uses them just makes my head hurt. They are based on an -excessive effort to avoid disturbing the precious VFS code.:-( The -gods only know how we are going to SMP the code that uses them. -znodes are the way! */ +/* + * maximal height of a tree. don't change this without + * changing JOURNAL_PER_BALANCE_CNT + */ +#define MAX_HEIGHT 5 + +/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ +#define EXTENDED_MAX_HEIGHT 7 + +/* Must be equal to at least 2. */ +#define FIRST_PATH_ELEMENT_OFFSET 2 + +/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ +#define ILLEGAL_PATH_ELEMENT_OFFSET 1 + +/* this MUST be MAX_HEIGHT + 1. See about FEB below */ +#define MAX_FEB_SIZE 6 + +/* + * We need to keep track of who the ancestors of nodes are. When we + * perform a search we record which nodes were visited while + * descending the tree looking for the node we searched for. This list + * of nodes is called the path. This information is used while + * performing balancing. Note that this path information may become + * invalid, and this means we must check it when using it to see if it + * is still valid. You'll need to read search_by_key and the comments + * in it, especially about decrement_counters_in_path(), to understand + * this structure. + * + * Paths make the code so much harder to work with and debug.... An + * enormous number of bugs are due to them, and trying to write or modify + * code that uses them just makes my head hurt. They are based on an + * excessive effort to avoid disturbing the precious VFS code.:-( The + * gods only know how we are going to SMP the code that uses them. + * znodes are the way! + */ #define PATH_READA 0x1 /* do read ahead */ #define PATH_READA_BACK 0x2 /* read backwards */ @@ -1889,7 +2125,8 @@ znodes are the way! */ struct treepath { int path_length; /* Length of the array above. */ int reada; - struct path_element path_elements[EXTENDED_MAX_HEIGHT]; /* Array of the path elements. */ + /* Array of the path elements. */ + struct path_element path_elements[EXTENDED_MAX_HEIGHT]; int pos_in_item; }; @@ -1908,41 +2145,124 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} #define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position) #define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length)) - /* you know, to the person who didn't - write this the macro name does not - at first suggest what it does. - Maybe POSITION_FROM_PATH_END? Or - maybe we should just focus on - dumping paths... -Hans */ + +/* + * you know, to the person who didn't write this the macro name does not + * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or + * maybe we should just focus on dumping paths... -Hans + */ #define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length)) -#define PATH_PITEM_HEAD(path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)) +/* + * in do_balance leaf has h == 0 in contrast with path structure, + * where root has level == 0. That is why we need these defines + */ + +/* tb->S[h] */ +#define PATH_H_PBUFFER(path, h) \ + PATH_OFFSET_PBUFFER(path, path->path_length - (h)) + +/* tb->F[h] or tb->S[0]->b_parent */ +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1) + +#define PATH_H_POSITION(path, h) \ + PATH_OFFSET_POSITION(path, path->path_length - (h)) -/* in do_balance leaf has h == 0 in contrast with path structure, - where root has level == 0. That is why we need these defines */ -#define PATH_H_PBUFFER(path, h) PATH_OFFSET_PBUFFER (path, path->path_length - (h)) /* tb->S[h] */ -#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1) /* tb->F[h] or tb->S[0]->b_parent */ -#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h)) -#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) /* tb->S[h]->b_item_order */ +/* tb->S[h]->b_item_order */ +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) #define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h)) +static inline void *reiserfs_node_data(const struct buffer_head *bh) +{ + return bh->b_data + sizeof(struct block_head); +} + +/* get key from internal node */ +static inline struct reiserfs_key *internal_key(struct buffer_head *bh, + int item_num) +{ + struct reiserfs_key *key = reiserfs_node_data(bh); + + return &key[item_num]; +} + +/* get the item header from leaf node */ +static inline struct item_head *item_head(const struct buffer_head *bh, + int item_num) +{ + struct item_head *ih = reiserfs_node_data(bh); + + return &ih[item_num]; +} + +/* get the key from leaf node */ +static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh, + int item_num) +{ + return &item_head(bh, item_num)->ih_key; +} + +static inline void *ih_item_body(const struct buffer_head *bh, + const struct item_head *ih) +{ + return bh->b_data + ih_location(ih); +} + +/* get item body from leaf node */ +static inline void *item_body(const struct buffer_head *bh, int item_num) +{ + return ih_item_body(bh, item_head(bh, item_num)); +} + +static inline struct item_head *tp_item_head(const struct treepath *path) +{ + return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + +static inline void *tp_item_body(const struct treepath *path) +{ + return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path)); +} + #define get_last_bh(path) PATH_PLAST_BUFFER(path) -#define get_ih(path) PATH_PITEM_HEAD(path) #define get_item_pos(path) PATH_LAST_POSITION(path) -#define get_item(path) ((void *)B_N_PITEM(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION (path))) #define item_moved(ih,path) comp_items(ih, path) #define path_changed(ih,path) comp_items (ih, path) -/***************************************************************************/ -/* MISC */ -/***************************************************************************/ +/* array of the entry headers */ + /* get item body */ +#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih))) + +/* + * length of the directory entry in directory item. This define + * calculates length of i-th directory entry using directory entry + * locations from dir entry head. When it calculates length of 0-th + * directory entry, it uses length of whole item in place of entry + * location of the non-existent following entry in the calculation. + * See picture above. + */ +static inline int entry_length(const struct buffer_head *bh, + const struct item_head *ih, int pos_in_item) +{ + struct reiserfs_de_head *deh; + + deh = B_I_DEH(bh, ih) + pos_in_item; + if (pos_in_item) + return deh_location(deh - 1) - deh_location(deh); + + return ih_item_len(ih) - deh_location(deh); +} + +/*************************************************************************** + * MISC * + ***************************************************************************/ /* Size of pointer to the unformatted node. */ #define UNFM_P_SIZE (sizeof(unp_t)) #define UNFM_P_SHIFT 2 -// in in-core inode key is stored on le form +/* in in-core inode key is stored on le form */ #define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key)) #define MAX_UL_INT 0xffffffff @@ -1958,7 +2278,6 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) return (loff_t) ((~(__u64) 0) >> 4); } -/*#define MAX_KEY_UNIQUENESS MAX_UL_INT*/ #define MAX_KEY_OBJECTID MAX_UL_INT #define MAX_B_NUM MAX_UL_INT @@ -1967,9 +2286,12 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) /* the purpose is to detect overflow of an unsigned short */ #define REISERFS_LINK_MAX (MAX_US_INT - 1000) -/* The following defines are used in reiserfs_insert_item and reiserfs_append_item */ -#define REISERFS_KERNEL_MEM 0 /* reiserfs kernel memory mode */ -#define REISERFS_USER_MEM 1 /* reiserfs user memory mode */ +/* + * The following defines are used in reiserfs_insert_item + * and reiserfs_append_item + */ +#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */ +#define REISERFS_USER_MEM 1 /* user memory mode */ #define fs_generation(s) (REISERFS_SB(s)->s_generation_counter) #define get_generation(s) atomic_read (&fs_generation(s)) @@ -1981,46 +2303,65 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) __fs_changed(gen, s); \ }) -/***************************************************************************/ -/* FIXATE NODES */ -/***************************************************************************/ +/*************************************************************************** + * FIXATE NODES * + ***************************************************************************/ #define VI_TYPE_LEFT_MERGEABLE 1 #define VI_TYPE_RIGHT_MERGEABLE 2 -/* To make any changes in the tree we always first find node, that - contains item to be changed/deleted or place to insert a new - item. We call this node S. To do balancing we need to decide what - we will shift to left/right neighbor, or to a new node, where new - item will be etc. To make this analysis simpler we build virtual - node. Virtual node is an array of items, that will replace items of - node S. (For instance if we are going to delete an item, virtual - node does not contain it). Virtual node keeps information about - item sizes and types, mergeability of first and last items, sizes - of all entries in directory item. We use this array of items when - calculating what we can shift to neighbors and how many nodes we - have to have if we do not any shiftings, if we shift to left/right - neighbor or to both. */ +/* + * To make any changes in the tree we always first find node, that + * contains item to be changed/deleted or place to insert a new + * item. We call this node S. To do balancing we need to decide what + * we will shift to left/right neighbor, or to a new node, where new + * item will be etc. To make this analysis simpler we build virtual + * node. Virtual node is an array of items, that will replace items of + * node S. (For instance if we are going to delete an item, virtual + * node does not contain it). Virtual node keeps information about + * item sizes and types, mergeability of first and last items, sizes + * of all entries in directory item. We use this array of items when + * calculating what we can shift to neighbors and how many nodes we + * have to have if we do not any shiftings, if we shift to left/right + * neighbor or to both. + */ struct virtual_item { - int vi_index; // index in the array of item operations - unsigned short vi_type; // left/right mergeability - unsigned short vi_item_len; /* length of item that it will have after balancing */ + int vi_index; /* index in the array of item operations */ + unsigned short vi_type; /* left/right mergeability */ + + /* length of item that it will have after balancing */ + unsigned short vi_item_len; + struct item_head *vi_ih; - const char *vi_item; // body of item (old or new) - const void *vi_new_data; // 0 always but paste mode - void *vi_uarea; // item specific area + const char *vi_item; /* body of item (old or new) */ + const void *vi_new_data; /* 0 always but paste mode */ + void *vi_uarea; /* item specific area */ }; struct virtual_node { - char *vn_free_ptr; /* this is a pointer to the free space in the buffer */ + /* this is a pointer to the free space in the buffer */ + char *vn_free_ptr; + unsigned short vn_nr_item; /* number of items in virtual node */ - short vn_size; /* size of node , that node would have if it has unlimited size and no balancing is performed */ - short vn_mode; /* mode of balancing (paste, insert, delete, cut) */ + + /* + * size of node , that node would have if it has + * unlimited size and no balancing is performed + */ + short vn_size; + + /* mode of balancing (paste, insert, delete, cut) */ + short vn_mode; + short vn_affected_item_num; short vn_pos_in_item; - struct item_head *vn_ins_ih; /* item header of inserted item, 0 for other modes */ + + /* item header of inserted item, 0 for other modes */ + struct item_head *vn_ins_ih; const void *vn_data; - struct virtual_item *vn_vi; /* array of items (including a new one, excluding item to be deleted) */ + + /* array of items (including a new one, excluding item to be deleted) */ + struct virtual_item *vn_vi; }; /* used by directory items when creating virtual nodes */ @@ -2030,22 +2371,25 @@ struct direntry_uarea { __u16 entry_sizes[1]; } __attribute__ ((__packed__)); -/***************************************************************************/ -/* TREE BALANCE */ -/***************************************************************************/ +/*************************************************************************** + * TREE BALANCE * + ***************************************************************************/ -/* This temporary structure is used in tree balance algorithms, and - constructed as we go to the extent that its various parts are - needed. It contains arrays of nodes that can potentially be - involved in the balancing of node S, and parameters that define how - each of the nodes must be balanced. Note that in these algorithms - for balancing the worst case is to need to balance the current node - S and the left and right neighbors and all of their parents plus - create a new node. We implement S1 balancing for the leaf nodes - and S0 balancing for the internal nodes (S1 and S0 are defined in - our papers.)*/ +/* + * This temporary structure is used in tree balance algorithms, and + * constructed as we go to the extent that its various parts are + * needed. It contains arrays of nodes that can potentially be + * involved in the balancing of node S, and parameters that define how + * each of the nodes must be balanced. Note that in these algorithms + * for balancing the worst case is to need to balance the current node + * S and the left and right neighbors and all of their parents plus + * create a new node. We implement S1 balancing for the leaf nodes + * and S0 balancing for the internal nodes (S1 and S0 are defined in + * our papers.) + */ -#define MAX_FREE_BLOCK 7 /* size of the array of buffers to free at end of do_balance */ +/* size of the array of buffers to free at end of do_balance */ +#define MAX_FREE_BLOCK 7 /* maximum number of FEB blocknrs on a single level */ #define MAX_AMOUNT_NEEDED 2 @@ -2057,64 +2401,144 @@ struct tree_balance { struct super_block *tb_sb; struct reiserfs_transaction_handle *transaction_handle; struct treepath *tb_path; - struct buffer_head *L[MAX_HEIGHT]; /* array of left neighbors of nodes in the path */ - struct buffer_head *R[MAX_HEIGHT]; /* array of right neighbors of nodes in the path */ - struct buffer_head *FL[MAX_HEIGHT]; /* array of fathers of the left neighbors */ - struct buffer_head *FR[MAX_HEIGHT]; /* array of fathers of the right neighbors */ - struct buffer_head *CFL[MAX_HEIGHT]; /* array of common parents of center node and its left neighbor */ - struct buffer_head *CFR[MAX_HEIGHT]; /* array of common parents of center node and its right neighbor */ - - struct buffer_head *FEB[MAX_FEB_SIZE]; /* array of empty buffers. Number of buffers in array equals - cur_blknum. */ + + /* array of left neighbors of nodes in the path */ + struct buffer_head *L[MAX_HEIGHT]; + + /* array of right neighbors of nodes in the path */ + struct buffer_head *R[MAX_HEIGHT]; + + /* array of fathers of the left neighbors */ + struct buffer_head *FL[MAX_HEIGHT]; + + /* array of fathers of the right neighbors */ + struct buffer_head *FR[MAX_HEIGHT]; + /* array of common parents of center node and its left neighbor */ + struct buffer_head *CFL[MAX_HEIGHT]; + + /* array of common parents of center node and its right neighbor */ + struct buffer_head *CFR[MAX_HEIGHT]; + + /* + * array of empty buffers. Number of buffers in array equals + * cur_blknum. + */ + struct buffer_head *FEB[MAX_FEB_SIZE]; struct buffer_head *used[MAX_FEB_SIZE]; struct buffer_head *thrown[MAX_FEB_SIZE]; - int lnum[MAX_HEIGHT]; /* array of number of items which must be - shifted to the left in order to balance the - current node; for leaves includes item that - will be partially shifted; for internal - nodes, it is the number of child pointers - rather than items. It includes the new item - being created. The code sometimes subtracts - one to get the number of wholly shifted - items for other purposes. */ - int rnum[MAX_HEIGHT]; /* substitute right for left in comment above */ - int lkey[MAX_HEIGHT]; /* array indexed by height h mapping the key delimiting L[h] and - S[h] to its item number within the node CFL[h] */ - int rkey[MAX_HEIGHT]; /* substitute r for l in comment above */ - int insert_size[MAX_HEIGHT]; /* the number of bytes by we are trying to add or remove from - S[h]. A negative value means removing. */ - int blknum[MAX_HEIGHT]; /* number of nodes that will replace node S[h] after - balancing on the level h of the tree. If 0 then S is - being deleted, if 1 then S is remaining and no new nodes - are being created, if 2 or 3 then 1 or 2 new nodes is - being created */ + + /* + * array of number of items which must be shifted to the left in + * order to balance the current node; for leaves includes item that + * will be partially shifted; for internal nodes, it is the number + * of child pointers rather than items. It includes the new item + * being created. The code sometimes subtracts one to get the + * number of wholly shifted items for other purposes. + */ + int lnum[MAX_HEIGHT]; + + /* substitute right for left in comment above */ + int rnum[MAX_HEIGHT]; + + /* + * array indexed by height h mapping the key delimiting L[h] and + * S[h] to its item number within the node CFL[h] + */ + int lkey[MAX_HEIGHT]; + + /* substitute r for l in comment above */ + int rkey[MAX_HEIGHT]; + + /* + * the number of bytes by we are trying to add or remove from + * S[h]. A negative value means removing. + */ + int insert_size[MAX_HEIGHT]; + + /* + * number of nodes that will replace node S[h] after balancing + * on the level h of the tree. If 0 then S is being deleted, + * if 1 then S is remaining and no new nodes are being created, + * if 2 or 3 then 1 or 2 new nodes is being created + */ + int blknum[MAX_HEIGHT]; /* fields that are used only for balancing leaves of the tree */ - int cur_blknum; /* number of empty blocks having been already allocated */ - int s0num; /* number of items that fall into left most node when S[0] splits */ - int s1num; /* number of items that fall into first new node when S[0] splits */ - int s2num; /* number of items that fall into second new node when S[0] splits */ - int lbytes; /* number of bytes which can flow to the left neighbor from the left */ - /* most liquid item that cannot be shifted from S[0] entirely */ - /* if -1 then nothing will be partially shifted */ - int rbytes; /* number of bytes which will flow to the right neighbor from the right */ - /* most liquid item that cannot be shifted from S[0] entirely */ - /* if -1 then nothing will be partially shifted */ - int s1bytes; /* number of bytes which flow to the first new node when S[0] splits */ - /* note: if S[0] splits into 3 nodes, then items do not need to be cut */ - int s2bytes; - struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; /* buffers which are to be freed after do_balance finishes by unfix_nodes */ - char *vn_buf; /* kmalloced memory. Used to create - virtual node and keep map of - dirtied bitmap blocks */ + + /* number of empty blocks having been already allocated */ + int cur_blknum; + + /* number of items that fall into left most node when S[0] splits */ + int s0num; + + /* + * number of bytes which can flow to the left neighbor from the left + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int lbytes; + + /* + * number of bytes which will flow to the right neighbor from the right + * most liquid item that cannot be shifted from S[0] entirely + * if -1 then nothing will be partially shifted + */ + int rbytes; + + + /* + * index into the array of item headers in + * S[0] of the affected item + */ + int item_pos; + + /* new nodes allocated to hold what could not fit into S */ + struct buffer_head *S_new[2]; + + /* + * number of items that will be placed into nodes in S_new + * when S[0] splits + */ + int snum[2]; + + /* + * number of bytes which flow to nodes in S_new when S[0] splits + * note: if S[0] splits into 3 nodes, then items do not need to be cut + */ + int sbytes[2]; + + int pos_in_item; + int zeroes_num; + + /* + * buffers which are to be freed after do_balance finishes + * by unfix_nodes + */ + struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; + + /* + * kmalloced memory. Used to create virtual node and keep + * map of dirtied bitmap blocks + */ + char *vn_buf; + int vn_buf_size; /* size of the vn_buf */ - struct virtual_node *tb_vn; /* VN starts after bitmap of bitmap blocks */ - int fs_gen; /* saved value of `reiserfs_generation' counter - see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */ + /* VN starts after bitmap of bitmap blocks */ + struct virtual_node *tb_vn; + + /* + * saved value of `reiserfs_generation' counter see + * FILESYSTEM_CHANGED() macro in reiserfs_fs.h + */ + int fs_gen; + #ifdef DISPLACE_NEW_PACKING_LOCALITIES - struct in_core_key key; /* key pointer, to pass to block allocator or - another low-level subsystem */ + /* + * key pointer, to pass to block allocator or + * another low-level subsystem + */ + struct in_core_key key; #endif }; @@ -2122,20 +2546,24 @@ struct tree_balance { /* When inserting an item. */ #define M_INSERT 'i' -/* When inserting into (directories only) or appending onto an already - existent item. */ +/* + * When inserting into (directories only) or appending onto an already + * existent item. + */ #define M_PASTE 'p' /* When deleting an item. */ #define M_DELETE 'd' /* When truncating an item or removing an entry from a (directory) item. */ -#define M_CUT 'c' +#define M_CUT 'c' /* used when balancing on leaf level skipped (in reiserfsck) */ #define M_INTERNAL 'n' -/* When further balancing is not needed, then do_balance does not need - to be called. */ -#define M_SKIP_BALANCING 's' +/* + * When further balancing is not needed, then do_balance does not need + * to be called. + */ +#define M_SKIP_BALANCING 's' #define M_CONVERT 'v' /* modes of leaf_move_items */ @@ -2148,8 +2576,10 @@ struct tree_balance { #define FIRST_TO_LAST 0 #define LAST_TO_FIRST 1 -/* used in do_balance for passing parent of node information that has - been gotten from tb struct */ +/* + * used in do_balance for passing parent of node information that has + * been gotten from tb struct + */ struct buffer_info { struct tree_balance *tb; struct buffer_head *bi_bh; @@ -2167,20 +2597,24 @@ static inline struct super_block *sb_from_bi(struct buffer_info *bi) return bi ? sb_from_tb(bi->tb) : NULL; } -/* there are 4 types of items: stat data, directory item, indirect, direct. -+-------------------+------------+--------------+------------+ -| | k_offset | k_uniqueness | mergeable? | -+-------------------+------------+--------------+------------+ -| stat data | 0 | 0 | no | -+-------------------+------------+--------------+------------+ -| 1st directory item| DOT_OFFSET |DIRENTRY_UNIQUENESS| no | -| non 1st directory | hash value | | yes | -| item | | | | -+-------------------+------------+--------------+------------+ -| indirect item | offset + 1 |TYPE_INDIRECT | if this is not the first indirect item of the object -+-------------------+------------+--------------+------------+ -| direct item | offset + 1 |TYPE_DIRECT | if not this is not the first direct item of the object -+-------------------+------------+--------------+------------+ +/* + * there are 4 types of items: stat data, directory item, indirect, direct. + * +-------------------+------------+--------------+------------+ + * | | k_offset | k_uniqueness | mergeable? | + * +-------------------+------------+--------------+------------+ + * | stat data | 0 | 0 | no | + * +-------------------+------------+--------------+------------+ + * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no | + * | non 1st directory | hash value | UNIQUENESS | yes | + * | item | | | | + * +-------------------+------------+--------------+------------+ + * | indirect item | offset + 1 |TYPE_INDIRECT | [1] | + * +-------------------+------------+--------------+------------+ + * | direct item | offset + 1 |TYPE_DIRECT | [2] | + * +-------------------+------------+--------------+------------+ + * + * [1] if this is not the first indirect item of the object + * [2] if this is not the first direct item of the object */ struct item_operations { @@ -2219,49 +2653,43 @@ extern struct item_operations *item_ops[TYPE_ANY + 1]; /* number of blocks pointed to by the indirect item */ #define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE) -/* the used space within the unformatted node corresponding to pos within the item pointed to by ih */ +/* + * the used space within the unformatted node corresponding + * to pos within the item pointed to by ih + */ #define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size)) -/* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */ - -/* get the item header */ -#define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) ) - -/* get key */ -#define B_N_PDELIM_KEY(bh,item_num) ( (struct reiserfs_key * )((bh)->b_data + BLKH_SIZE) + (item_num) ) - -/* get the key */ -#define B_N_PKEY(bh,item_num) ( &(B_N_PITEM_HEAD(bh,item_num)->ih_key) ) - -/* get item body */ -#define B_N_PITEM(bh,item_num) ( (bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(item_num)))) - -/* get the stat data by the buffer header and the item order */ -#define B_N_STAT_DATA(bh,nr) \ -( (struct stat_data *)((bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(nr))) ) ) +/* + * number of bytes contained by the direct item or the + * unformatted nodes the indirect item points to + */ - /* following defines use reiserfs buffer header and item header */ +/* following defines use reiserfs buffer header and item header */ /* get stat-data */ #define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) ) -// this is 3976 for size==4096 +/* this is 3976 for size==4096 */ #define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE) -/* indirect items consist of entries which contain blocknrs, pos - indicates which entry, and B_I_POS_UNFM_POINTER resolves to the - blocknr contained by the entry pos points to */ -#define B_I_POS_UNFM_POINTER(bh,ih,pos) le32_to_cpu(*(((unp_t *)B_I_PITEM(bh,ih)) + (pos))) -#define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0) +/* + * indirect items consist of entries which contain blocknrs, pos + * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the + * blocknr contained by the entry pos points to + */ +#define B_I_POS_UNFM_POINTER(bh, ih, pos) \ + le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos))) +#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \ + (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val)) struct reiserfs_iget_args { __u32 objectid; __u32 dirid; }; -/***************************************************************************/ -/* FUNCTION DECLARATIONS */ -/***************************************************************************/ +/*************************************************************************** + * FUNCTION DECLARATIONS * + ***************************************************************************/ #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) @@ -2273,7 +2701,10 @@ struct reiserfs_iget_args { /* first block written in a commit. */ struct reiserfs_journal_desc { __le32 j_trans_id; /* id of commit */ - __le32 j_len; /* length of commit. len +1 is the commit block */ + + /* length of commit. len +1 is the commit block */ + __le32 j_len; + __le32 j_mount_id; /* mount id of this trans */ __le32 j_realblock[1]; /* real locations for each block */ }; @@ -2300,22 +2731,35 @@ struct reiserfs_journal_commit { #define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0) #define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0) -/* this header block gets written whenever a transaction is considered fully flushed, and is more recent than the -** last fully flushed transaction. fully flushed means all the log blocks and all the real blocks are on disk, -** and this transaction does not need to be replayed. -*/ +/* + * this header block gets written whenever a transaction is considered + * fully flushed, and is more recent than the last fully flushed transaction. + * fully flushed means all the log blocks and all the real blocks are on + * disk, and this transaction does not need to be replayed. + */ struct reiserfs_journal_header { - __le32 j_last_flush_trans_id; /* id of last fully flushed transaction */ - __le32 j_first_unflushed_offset; /* offset in the log of where to start replay after a crash */ + /* id of last fully flushed transaction */ + __le32 j_last_flush_trans_id; + + /* offset in the log of where to start replay after a crash */ + __le32 j_first_unflushed_offset; + __le32 j_mount_id; /* 12 */ struct journal_params jh_journal; }; /* biggest tunable defines are right here */ #define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */ -#define JOURNAL_TRANS_MAX_DEFAULT 1024 /* biggest possible single transaction, don't change for now (8/3/99) */ + +/* biggest possible single transaction, don't change for now (8/3/99) */ +#define JOURNAL_TRANS_MAX_DEFAULT 1024 #define JOURNAL_TRANS_MIN_DEFAULT 256 -#define JOURNAL_MAX_BATCH_DEFAULT 900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */ + +/* + * max blocks to batch into one transaction, + * don't make this any bigger than 900 + */ +#define JOURNAL_MAX_BATCH_DEFAULT 900 #define JOURNAL_MIN_RATIO 2 #define JOURNAL_MAX_COMMIT_AGE 30 #define JOURNAL_MAX_TRANS_AGE 30 @@ -2340,16 +2784,18 @@ struct reiserfs_journal_header { #define REISERFS_QUOTA_DEL_BLOCKS(s) 0 #endif -/* both of these can be as low as 1, or as high as you want. The min is the -** number of 4k bitmap nodes preallocated on mount. New nodes are allocated -** as needed, and released when transactions are committed. On release, if -** the current number of nodes is > max, the node is freed, otherwise, -** it is put on a free list for faster use later. +/* + * both of these can be as low as 1, or as high as you want. The min is the + * number of 4k bitmap nodes preallocated on mount. New nodes are allocated + * as needed, and released when transactions are committed. On release, if + * the current number of nodes is > max, the node is freed, otherwise, + * it is put on a free list for faster use later. */ #define REISERFS_MIN_BITMAP_NODES 10 #define REISERFS_MAX_BITMAP_NODES 100 -#define JBH_HASH_SHIFT 13 /* these are based on journal hash size of 8192 */ +/* these are based on journal hash size of 8192 */ +#define JBH_HASH_SHIFT 13 #define JBH_HASH_MASK 8191 #define _jhashfn(sb,block) \ @@ -2357,7 +2803,7 @@ struct reiserfs_journal_header { (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) -// We need these to make journal.c code more readable +/* We need these to make journal.c code more readable */ #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) @@ -2365,12 +2811,14 @@ struct reiserfs_journal_header { enum reiserfs_bh_state_bits { BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */ BH_JDirty_wait, - BH_JNew, /* disk block was taken off free list before - * being in a finished transaction, or - * written to disk. Can be reused immed. */ + /* + * disk block was taken off free list before being in a + * finished transaction, or written to disk. Can be reused immed. + */ + BH_JNew, BH_JPrepared, BH_JRestore_dirty, - BH_JTest, // debugging only will go away + BH_JTest, /* debugging only will go away */ }; BUFFER_FNS(JDirty, journaled); @@ -2386,27 +2834,36 @@ TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty); BUFFER_FNS(JTest, journal_test); TAS_BUFFER_FNS(JTest, journal_test); -/* -** transaction handle which is passed around for all journal calls -*/ +/* transaction handle which is passed around for all journal calls */ struct reiserfs_transaction_handle { - struct super_block *t_super; /* super for this FS when journal_begin was - called. saves calls to reiserfs_get_super - also used by nested transactions to make - sure they are nesting on the right FS - _must_ be first in the handle - */ + /* + * super for this FS when journal_begin was called. saves calls to + * reiserfs_get_super also used by nested transactions to make + * sure they are nesting on the right FS _must_ be first + * in the handle + */ + struct super_block *t_super; + int t_refcount; int t_blocks_logged; /* number of blocks this writer has logged */ int t_blocks_allocated; /* number of blocks this writer allocated */ - unsigned int t_trans_id; /* sanity check, equals the current trans id */ + + /* sanity check, equals the current trans id */ + unsigned int t_trans_id; + void *t_handle_save; /* save existing current->journal_info */ - unsigned displace_new_blocks:1; /* if new block allocation occurres, that block - should be displaced from others */ + + /* + * if new block allocation occurres, that block + * should be displaced from others + */ + unsigned displace_new_blocks:1; + struct list_head t_list; }; -/* used to keep track of ordered and tail writes, attached to the buffer +/* + * used to keep track of ordered and tail writes, attached to the buffer * head through b_journal_head. */ struct reiserfs_jh { @@ -2419,7 +2876,7 @@ void reiserfs_free_jh(struct buffer_head *bh); int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh); int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh); int journal_mark_dirty(struct reiserfs_transaction_handle *, - struct super_block *, struct buffer_head *bh); + struct buffer_head *bh); static inline int reiserfs_file_data_log(struct inode *inode) { @@ -2469,10 +2926,8 @@ int journal_init(struct super_block *, const char *j_dev_name, int old_format, int journal_release(struct reiserfs_transaction_handle *, struct super_block *); int journal_release_error(struct reiserfs_transaction_handle *, struct super_block *); -int journal_end(struct reiserfs_transaction_handle *, struct super_block *, - unsigned long); -int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, - unsigned long); +int journal_end(struct reiserfs_transaction_handle *); +int journal_end_sync(struct reiserfs_transaction_handle *); int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr); int journal_transaction_should_end(struct reiserfs_transaction_handle *, int); @@ -2481,7 +2936,7 @@ int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr, int journal_begin(struct reiserfs_transaction_handle *, struct super_block *sb, unsigned long); int journal_join_abort(struct reiserfs_transaction_handle *, - struct super_block *sb, unsigned long); + struct super_block *sb); void reiserfs_abort_journal(struct super_block *sb, int errno); void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...); int reiserfs_allocate_list_bitmaps(struct super_block *s, @@ -2503,20 +2958,18 @@ int B_IS_IN_TREE(const struct buffer_head *); extern void copy_item_head(struct item_head *to, const struct item_head *from); -// first key is in cpu form, second - le +/* first key is in cpu form, second - le */ extern int comp_short_keys(const struct reiserfs_key *le_key, const struct cpu_key *cpu_key); extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from); -// both are in le form +/* both are in le form */ extern int comp_le_keys(const struct reiserfs_key *, const struct reiserfs_key *); extern int comp_short_le_keys(const struct reiserfs_key *, const struct reiserfs_key *); -// -// get key version from on disk key - kludge -// +/* * get key version from on disk key - kludge */ static inline int le_key_version(const struct reiserfs_key *key) { int type; @@ -2593,12 +3046,12 @@ void padd_item(char *item, int total_length, int length); /* inode.c */ /* args for the create parameter of reiserfs_get_block */ -#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ -#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ -#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ -#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ -#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ -#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ +#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ +#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ +#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ +#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ +#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */ +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ void reiserfs_read_locked_inode(struct inode *inode, struct reiserfs_iget_args *args); @@ -2797,25 +3250,49 @@ struct buffer_head *get_FEB(struct tree_balance *); /* bitmap.c */ -/* structure contains hints for block allocator, and it is a container for - * arguments, such as node, search path, transaction_handle, etc. */ +/* + * structure contains hints for block allocator, and it is a container for + * arguments, such as node, search path, transaction_handle, etc. + */ struct __reiserfs_blocknr_hint { - struct inode *inode; /* inode passed to allocator, if we allocate unf. nodes */ + /* inode passed to allocator, if we allocate unf. nodes */ + struct inode *inode; + sector_t block; /* file offset, in blocks */ struct in_core_key key; - struct treepath *path; /* search path, used by allocator to deternine search_start by - * various ways */ - struct reiserfs_transaction_handle *th; /* transaction handle is needed to log super blocks and - * bitmap blocks changes */ + + /* + * search path, used by allocator to deternine search_start by + * various ways + */ + struct treepath *path; + + /* + * transaction handle is needed to log super blocks + * and bitmap blocks changes + */ + struct reiserfs_transaction_handle *th; + b_blocknr_t beg, end; - b_blocknr_t search_start; /* a field used to transfer search start value (block number) - * between different block allocator procedures - * (determine_search_start() and others) */ - int prealloc_size; /* is set in determine_prealloc_size() function, used by underlayed - * function that do actual allocation */ - - unsigned formatted_node:1; /* the allocator uses different polices for getting disk space for - * formatted/unformatted blocks with/without preallocation */ + + /* + * a field used to transfer search start value (block number) + * between different block allocator procedures + * (determine_search_start() and others) + */ + b_blocknr_t search_start; + + /* + * is set in determine_prealloc_size() function, + * used by underlayed function that do actual allocation + */ + int prealloc_size; + + /* + * the allocator uses different polices for getting disk + * space for formatted/unformatted blocks with/without preallocation + */ + unsigned formatted_node:1; unsigned preallocate:1; }; @@ -2909,13 +3386,15 @@ __u32 r5_hash(const signed char *msg, int len); #define reiserfs_test_le_bit test_bit_le #define reiserfs_find_next_zero_le_bit find_next_zero_bit_le -/* sometimes reiserfs_truncate may require to allocate few new blocks - to perform indirect2direct conversion. People probably used to - think, that truncate should work without problems on a filesystem - without free disk space. They may complain that they can not - truncate due to lack of free disk space. This spare space allows us - to not worry about it. 500 is probably too much, but it should be - absolutely safe */ +/* + * sometimes reiserfs_truncate may require to allocate few new blocks + * to perform indirect2direct conversion. People probably used to + * think, that truncate should work without problems on a filesystem + * without free disk space. They may complain that they can not + * truncate due to lack of free disk space. This spare space allows us + * to not worry about it. 500 is probably too much, but it should be + * absolutely safe + */ #define SPARE_SPACE 500 /* prototypes from ioctl.c */ diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index a4ef5cd606eb..6052d323bc9a 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -53,8 +53,10 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) } bforget(bh); - /* old disk layout detection; those partitions can be mounted, but - * cannot be resized */ + /* + * old disk layout detection; those partitions can be mounted, but + * cannot be resized + */ if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size != REISERFS_DISK_OFFSET_IN_BYTES) { printk @@ -86,12 +88,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); return -ENOMEM; } - /* the new journal bitmaps are zero filled, now we copy in the bitmap - ** node pointers from the old journal bitmap structs, and then - ** transfer the new data structures into the journal struct. - ** - ** using the copy_size var below allows this code to work for - ** both shrinking and expanding the FS. + /* + * the new journal bitmaps are zero filled, now we copy i + * the bitmap node pointers from the old journal bitmap + * structs, and then transfer the new data structures + * into the journal struct. + * + * using the copy_size var below allows this code to work for + * both shrinking and expanding the FS. */ copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr; copy_size = @@ -101,36 +105,45 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) jb = SB_JOURNAL(s)->j_list_bitmap + i; memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); - /* just in case vfree schedules on us, copy the new - ** pointer into the journal struct before freeing the - ** old one + /* + * just in case vfree schedules on us, copy the new + * pointer into the journal struct before freeing the + * old one */ node_tmp = jb->bitmaps; jb->bitmaps = jbitmap[i].bitmaps; vfree(node_tmp); } - /* allocate additional bitmap blocks, reallocate array of bitmap - * block pointers */ + /* + * allocate additional bitmap blocks, reallocate + * array of bitmap block pointers + */ bitmap = vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); if (!bitmap) { - /* Journal bitmaps are still supersized, but the memory isn't - * leaked, so I guess it's ok */ + /* + * Journal bitmaps are still supersized, but the + * memory isn't leaked, so I guess it's ok + */ printk("reiserfs_resize: unable to allocate memory.\n"); return -ENOMEM; } for (i = 0; i < bmap_nr; i++) bitmap[i] = old_bitmap[i]; - /* This doesn't go through the journal, but it doesn't have to. - * The changes are still atomic: We're synced up when the journal - * transaction begins, and the new bitmaps don't matter if the - * transaction fails. */ + /* + * This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the + * journal transaction begins, and the new bitmaps don't + * matter if the transaction fails. + */ for (i = bmap_nr; i < bmap_nr_new; i++) { int depth; - /* don't use read_bitmap_block since it will cache - * the uninitialized bitmap */ + /* + * don't use read_bitmap_block since it will cache + * the uninitialized bitmap + */ depth = reiserfs_write_unlock_nested(s); bh = sb_bread(s, i * s->s_blocksize * 8); reiserfs_write_lock_nested(s, depth); @@ -147,7 +160,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) depth = reiserfs_write_unlock_nested(s); sync_dirty_buffer(bh); reiserfs_write_lock_nested(s, depth); - // update bitmap_info stuff + /* update bitmap_info stuff */ bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; brelse(bh); } @@ -156,9 +169,11 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) vfree(old_bitmap); } - /* begin transaction, if there was an error, it's fine. Yes, we have + /* + * begin transaction, if there was an error, it's fine. Yes, we have * incorrect bitmaps now, but none of it is ever going to touch the - * disk anyway. */ + * disk anyway. + */ err = journal_begin(&th, s, 10); if (err) return err; @@ -167,7 +182,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) info = SB_AP_BITMAP(s) + bmap_nr - 1; bh = reiserfs_read_bitmap_block(s, bmap_nr - 1); if (!bh) { - int jerr = journal_end(&th, s, 10); + int jerr = journal_end(&th); if (jerr) return jerr; return -EIO; @@ -178,14 +193,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_clear_le_bit(i, bh->b_data); info->free_count += s->s_blocksize * 8 - block_r; - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); brelse(bh); /* Correct new last bitmap block - It may not be full */ info = SB_AP_BITMAP(s) + bmap_nr_new - 1; bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1); if (!bh) { - int jerr = journal_end(&th, s, 10); + int jerr = journal_end(&th); if (jerr) return jerr; return -EIO; @@ -194,7 +209,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) reiserfs_prepare_for_journal(s, bh, 1); for (i = block_r_new; i < s->s_blocksize * 8; i++) reiserfs_set_le_bit(i, bh->b_data); - journal_mark_dirty(&th, s, bh); + journal_mark_dirty(&th, bh); brelse(bh); info->free_count -= s->s_blocksize * 8 - block_r_new; @@ -207,8 +222,8 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) PUT_SB_BLOCK_COUNT(s, block_count_new); PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); SB_JOURNAL(s)->j_must_wait = 1; - return journal_end(&th, s, 10); + return journal_end(&th); } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index b14706a05d52..dd44468edc2b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -8,46 +8,6 @@ * Pereslavl-Zalessky Russia */ -/* - * This file contains functions dealing with S+tree - * - * B_IS_IN_TREE - * copy_item_head - * comp_short_keys - * comp_keys - * comp_short_le_keys - * le_key2cpu_key - * comp_le_keys - * bin_search - * get_lkey - * get_rkey - * key_in_buffer - * decrement_bcount - * reiserfs_check_path - * pathrelse_and_restore - * pathrelse - * search_by_key_reada - * search_by_key - * search_for_position_by_key - * comp_items - * prepare_for_direct_item - * prepare_for_direntry_item - * prepare_for_delete_or_cut - * calc_deleted_bytes_number - * init_tb_struct - * padd_item - * reiserfs_delete_item - * reiserfs_delete_solid_item - * reiserfs_delete_object - * maybe_indirect_to_direct - * indirect_to_direct_roll_back - * reiserfs_cut_from_item - * truncate_directory - * reiserfs_do_truncate - * reiserfs_paste_into_item - * reiserfs_insert_item - */ - #include <linux/time.h> #include <linux/string.h> #include <linux/pagemap.h> @@ -65,21 +25,21 @@ inline int B_IS_IN_TREE(const struct buffer_head *bh) return (B_LEVEL(bh) != FREE_LEVEL); } -// -// to gets item head in le form -// +/* to get item head in le form */ inline void copy_item_head(struct item_head *to, const struct item_head *from) { memcpy(to, from, IH_SIZE); } -/* k1 is pointer to on-disk structure which is stored in little-endian - form. k2 is pointer to cpu variable. For key of items of the same - object this returns 0. - Returns: -1 if key1 < key2 - 0 if key1 == key2 - 1 if key1 > key2 */ +/* + * k1 is pointer to on-disk structure which is stored in little-endian + * form. k2 is pointer to cpu variable. For key of items of the same + * object this returns 0. + * Returns: -1 if key1 < key2 + * 0 if key1 == key2 + * 1 if key1 > key2 + */ inline int comp_short_keys(const struct reiserfs_key *le_key, const struct cpu_key *cpu_key) { @@ -97,11 +57,13 @@ inline int comp_short_keys(const struct reiserfs_key *le_key, return 0; } -/* k1 is pointer to on-disk structure which is stored in little-endian - form. k2 is pointer to cpu variable. - Compare keys using all 4 key fields. - Returns: -1 if key1 < key2 0 - if key1 = key2 1 if key1 > key2 */ +/* + * k1 is pointer to on-disk structure which is stored in little-endian + * form. k2 is pointer to cpu variable. + * Compare keys using all 4 key fields. + * Returns: -1 if key1 < key2 0 + * if key1 = key2 1 if key1 > key2 + */ static inline int comp_keys(const struct reiserfs_key *le_key, const struct cpu_key *cpu_key) { @@ -155,15 +117,17 @@ inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id); to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); - // find out version of the key + /* find out version of the key */ version = le_key_version(from); to->version = version; to->on_disk_key.k_offset = le_key_k_offset(version, from); to->on_disk_key.k_type = le_key_k_type(version, from); } -// this does not say which one is bigger, it only returns 1 if keys -// are not equal, 0 otherwise +/* + * this does not say which one is bigger, it only returns 1 if keys + * are not equal, 0 otherwise + */ inline int comp_le_keys(const struct reiserfs_key *k1, const struct reiserfs_key *k2) { @@ -177,24 +141,27 @@ inline int comp_le_keys(const struct reiserfs_key *k1, * *pos = number of the searched element if found, else the * * number of the first element that is larger than key. * **************************************************************************/ -/* For those not familiar with binary search: lbound is the leftmost item that it - could be, rbound the rightmost item that it could be. We examine the item - halfway between lbound and rbound, and that tells us either that we can increase - lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that - there are no possible items, and we have not found it. With each examination we - cut the number of possible items it could be by one more than half rounded down, - or we find it. */ +/* + * For those not familiar with binary search: lbound is the leftmost item + * that it could be, rbound the rightmost item that it could be. We examine + * the item halfway between lbound and rbound, and that tells us either + * that we can increase lbound, or decrease rbound, or that we have found it, + * or if lbound <= rbound that there are no possible items, and we have not + * found it. With each examination we cut the number of possible items it + * could be by one more than half rounded down, or we find it. + */ static inline int bin_search(const void *key, /* Key to search for. */ const void *base, /* First item in the array. */ int num, /* Number of items in the array. */ - int width, /* Item size in the array. - searched. Lest the reader be - confused, note that this is crafted - as a general function, and when it - is applied specifically to the array - of item headers in a node, width - is actually the item header size not - the item size. */ + /* + * Item size in the array. searched. Lest the + * reader be confused, note that this is crafted + * as a general function, and when it is applied + * specifically to the array of item headers in a + * node, width is actually the item header size + * not the item size. + */ + int width, int *pos /* Number of the searched for element. */ ) { @@ -216,8 +183,10 @@ static inline int bin_search(const void *key, /* Key to search for. */ return ITEM_FOUND; /* Key found in the array. */ } - /* bin_search did not find given key, it returns position of key, - that is minimal and greater than the given one. */ + /* + * bin_search did not find given key, it returns position of key, + * that is minimal and greater than the given one. + */ *pos = lbound; return ITEM_NOT_FOUND; } @@ -228,16 +197,20 @@ const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; /* Maximal possible key. It is never in the tree. */ static const struct reiserfs_key MAX_KEY = { - __constant_cpu_to_le32(0xffffffff), - __constant_cpu_to_le32(0xffffffff), - {{__constant_cpu_to_le32(0xffffffff), - __constant_cpu_to_le32(0xffffffff)},} + cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff), + {{cpu_to_le32(0xffffffff), + cpu_to_le32(0xffffffff)},} }; -/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom - of the path, and going upwards. We must check the path's validity at each step. If the key is not in - the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this - case we return a special key, either MIN_KEY or MAX_KEY. */ +/* + * Get delimiting key of the buffer by looking for it in the buffers in the + * path, starting from the bottom of the path, and going upwards. We must + * check the path's validity at each step. If the key is not in the path, + * there is no delimiting key in the tree (buffer is first or last buffer + * in tree), and in this case we return a special key, either MIN_KEY or + * MAX_KEY. + */ static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path, const struct super_block *sb) { @@ -270,9 +243,12 @@ static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_pat PATH_OFFSET_PBUFFER(chk_path, path_offset + 1)->b_blocknr) return &MAX_KEY; - /* Return delimiting key if position in the parent is not equal to zero. */ + /* + * Return delimiting key if position in the parent + * is not equal to zero. + */ if (position) - return B_N_PDELIM_KEY(parent, position - 1); + return internal_key(parent, position - 1); } /* Return MIN_KEY if we are in the root of the buffer tree. */ if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> @@ -308,15 +284,23 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, path_offset)) > B_NR_ITEMS(parent)) return &MIN_KEY; - /* Check whether parent at the path really points to the child. */ + /* + * Check whether parent at the path really points + * to the child. + */ if (B_N_CHILD_NUM(parent, position) != PATH_OFFSET_PBUFFER(chk_path, path_offset + 1)->b_blocknr) return &MIN_KEY; - /* Return delimiting key if position in the parent is not the last one. */ + + /* + * Return delimiting key if position in the parent + * is not the last one. + */ if (position != B_NR_ITEMS(parent)) - return B_N_PDELIM_KEY(parent, position); + return internal_key(parent, position); } + /* Return MAX_KEY if we are in the root of the buffer tree. */ if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)-> b_blocknr == SB_ROOT_BLOCK(sb)) @@ -324,13 +308,20 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path, return &MIN_KEY; } -/* Check whether a key is contained in the tree rooted from a buffer at a path. */ -/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in - the path. These delimiting keys are stored at least one level above that buffer in the tree. If the - buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in - this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ -static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */ - const struct cpu_key *key, /* Key which should be checked. */ +/* + * Check whether a key is contained in the tree rooted from a buffer at a path. + * This works by looking at the left and right delimiting keys for the buffer + * in the last path_element in the path. These delimiting keys are stored + * at least one level above that buffer in the tree. If the buffer is the + * first or last node in the tree order then one of the delimiting keys may + * be absent, and in this case get_lkey and get_rkey return a special key + * which is MIN_KEY or MAX_KEY. + */ +static inline int key_in_buffer( + /* Path which should be checked. */ + struct treepath *chk_path, + /* Key which should be checked. */ + const struct cpu_key *key, struct super_block *sb ) { @@ -359,9 +350,11 @@ int reiserfs_check_path(struct treepath *p) return 0; } -/* Drop the reference to each buffer in a path and restore +/* + * Drop the reference to each buffer in a path and restore * dirty bits clean when preparing the buffer for the log. - * This version should only be called from fix_nodes() */ + * This version should only be called from fix_nodes() + */ void pathrelse_and_restore(struct super_block *sb, struct treepath *search_path) { @@ -418,14 +411,17 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) } ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); + + /* free space does not match to calculated amount of use space */ if (used_space != blocksize - blkh_free_space(blkh)) { - /* free space does not match to calculated amount of use space */ reiserfs_warning(NULL, "reiserfs-5082", "free space seems wrong: %z", bh); return 0; } - // FIXME: it is_leaf will hit performance too much - we may have - // return 1 here + /* + * FIXME: it is_leaf will hit performance too much - we may have + * return 1 here + */ /* check tables of item heads */ ih = (struct item_head *)(buf + BLKH_SIZE); @@ -460,7 +456,7 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) prev_location = ih_location(ih); } - // one may imagine much more checks + /* one may imagine many more checks */ return 1; } @@ -481,8 +477,8 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh) } nr = blkh_nr_item(blkh); + /* for internal which is not root we might check min number of keys */ if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { - /* for internal which is not root we might check min number of keys */ reiserfs_warning(NULL, "reiserfs-5088", "number of key seems wrong: %z", bh); return 0; @@ -494,12 +490,15 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh) "free space seems wrong: %z", bh); return 0; } - // one may imagine much more checks + + /* one may imagine many more checks */ return 1; } -// make sure that bh contains formatted node of reiserfs tree of -// 'level'-th level +/* + * make sure that bh contains formatted node of reiserfs tree of + * 'level'-th level + */ static int is_tree_node(struct buffer_head *bh, int level) { if (B_LEVEL(bh) != level) { @@ -546,7 +545,8 @@ static int search_by_key_reada(struct super_block *s, for (j = 0; j < i; j++) { /* * note, this needs attention if we are getting rid of the BKL - * you have to make sure the prepared bit isn't set on this buffer + * you have to make sure the prepared bit isn't set on this + * buffer */ if (!buffer_uptodate(bh[j])) { if (depth == -1) @@ -558,39 +558,34 @@ static int search_by_key_reada(struct super_block *s, return depth; } -/************************************************************************** - * Algorithm SearchByKey * - * look for item in the Disk S+Tree by its key * - * Input: sb - super block * - * key - pointer to the key to search * - * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * - * search_path - path from the root to the needed leaf * - **************************************************************************/ - -/* This function fills up the path from the root to the leaf as it - descends the tree looking for the key. It uses reiserfs_bread to - try to find buffers in the cache given their block number. If it - does not find them in the cache it reads them from disk. For each - node search_by_key finds using reiserfs_bread it then uses - bin_search to look through that node. bin_search will find the - position of the block_number of the next node if it is looking - through an internal node. If it is looking through a leaf node - bin_search will find the position of the item which has key either - equal to given key, or which is the maximal key less than the given - key. search_by_key returns a path that must be checked for the - correctness of the top of the path but need not be checked for the - correctness of the bottom of the path */ -/* The function is NOT SCHEDULE-SAFE! */ -int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. */ - struct treepath *search_path,/* This structure was - allocated and initialized - by the calling - function. It is filled up - by this function. */ - int stop_level /* How far down the tree to search. To - stop at leaf level - set to - DISK_LEAF_NODE_LEVEL */ - ) +/* + * This function fills up the path from the root to the leaf as it + * descends the tree looking for the key. It uses reiserfs_bread to + * try to find buffers in the cache given their block number. If it + * does not find them in the cache it reads them from disk. For each + * node search_by_key finds using reiserfs_bread it then uses + * bin_search to look through that node. bin_search will find the + * position of the block_number of the next node if it is looking + * through an internal node. If it is looking through a leaf node + * bin_search will find the position of the item which has key either + * equal to given key, or which is the maximal key less than the given + * key. search_by_key returns a path that must be checked for the + * correctness of the top of the path but need not be checked for the + * correctness of the bottom of the path + */ +/* + * search_by_key - search for key (and item) in stree + * @sb: superblock + * @key: pointer to key to search for + * @search_path: Allocated and initialized struct treepath; Returned filled + * on success. + * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to + * stop at leaf level. + * + * The function is NOT SCHEDULE-SAFE! + */ +int search_by_key(struct super_block *sb, const struct cpu_key *key, + struct treepath *search_path, int stop_level) { b_blocknr_t block_number; int expected_level; @@ -609,17 +604,22 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s PROC_INFO_INC(sb, search_by_key); - /* As we add each node to a path we increase its count. This means that - we must be careful to release all nodes in a path before we either - discard the path struct or re-use the path struct, as we do here. */ + /* + * As we add each node to a path we increase its count. This means + * that we must be careful to release all nodes in a path before we + * either discard the path struct or re-use the path struct, as we + * do here. + */ pathrelse(search_path); right_neighbor_of_leaf_node = 0; - /* With each iteration of this loop we search through the items in the - current node, and calculate the next current node(next path element) - for the next iteration of this loop.. */ + /* + * With each iteration of this loop we search through the items in the + * current node, and calculate the next current node(next path element) + * for the next iteration of this loop.. + */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; while (1) { @@ -639,8 +639,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s ++search_path->path_length); fs_gen = get_generation(sb); - /* Read the next tree node, and set the last element in the path to - have a pointer to it. */ + /* + * Read the next tree node, and set the last element + * in the path to have a pointer to it. + */ if ((bh = last_element->pe_buffer = sb_getblk(sb, block_number))) { @@ -666,7 +668,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s if (!buffer_uptodate(bh)) goto io_error; } else { - io_error: +io_error: search_path->path_length--; pathrelse(search_path); return IO_ERROR; @@ -676,9 +678,12 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s expected_level = SB_TREE_HEIGHT(sb); expected_level--; - /* It is possible that schedule occurred. We must check whether the key - to search is still in the tree rooted from the current buffer. If - not then repeat search from the root. */ + /* + * It is possible that schedule occurred. We must check + * whether the key to search is still in the tree rooted + * from the current buffer. If not then repeat search + * from the root. + */ if (fs_changed(fs_gen, sb) && (!B_IS_IN_TREE(bh) || B_LEVEL(bh) != expected_level || @@ -689,8 +694,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s sbk_restarted[expected_level - 1]); pathrelse(search_path); - /* Get the root block number so that we can repeat the search - starting from the root. */ + /* + * Get the root block number so that we can + * repeat the search starting from the root. + */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; right_neighbor_of_leaf_node = 0; @@ -699,9 +706,11 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s continue; } - /* only check that the key is in the buffer if key is not - equal to the MAX_KEY. Latter case is only possible in - "finish_unfinished()" processing during mount. */ + /* + * only check that the key is in the buffer if key is not + * equal to the MAX_KEY. Latter case is only possible in + * "finish_unfinished()" processing during mount. + */ RFALSE(comp_keys(&MAX_KEY, key) && !key_in_buffer(search_path, key, sb), "PAP-5130: key is not in the buffer"); @@ -713,8 +722,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s } #endif - // make sure, that the node contents look like a node of - // certain level + /* + * make sure, that the node contents look like a node of + * certain level + */ if (!is_tree_node(bh, expected_level)) { reiserfs_error(sb, "vs-5150", "invalid format found in block %ld. " @@ -732,32 +743,42 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s "vs-5152: tree level (%d) is less than stop level (%d)", node_level, stop_level); - retval = bin_search(key, B_N_PITEM_HEAD(bh, 0), + retval = bin_search(key, item_head(bh, 0), B_NR_ITEMS(bh), (node_level == DISK_LEAF_NODE_LEVEL) ? IH_SIZE : KEY_SIZE, - &(last_element->pe_position)); + &last_element->pe_position); if (node_level == stop_level) { return retval; } /* we are not in the stop level */ + /* + * item has been found, so we choose the pointer which + * is to the right of the found one + */ if (retval == ITEM_FOUND) - /* item has been found, so we choose the pointer which is to the right of the found one */ last_element->pe_position++; - /* if item was not found we choose the position which is to - the left of the found item. This requires no code, - bin_search did it already. */ + /* + * if item was not found we choose the position which is to + * the left of the found item. This requires no code, + * bin_search did it already. + */ - /* So we have chosen a position in the current node which is - an internal node. Now we calculate child block number by - position in the node. */ + /* + * So we have chosen a position in the current node which is + * an internal node. Now we calculate child block number by + * position in the node. + */ block_number = B_N_CHILD_NUM(bh, last_element->pe_position); - /* if we are going to read leaf nodes, try for read ahead as well */ + /* + * if we are going to read leaf nodes, try for read + * ahead as well + */ if ((search_path->reada & PATH_READA) && node_level == DISK_LEAF_NODE_LEVEL + 1) { int pos = last_element->pe_position; @@ -779,7 +800,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s /* * check to make sure we're in the same object */ - le_key = B_N_PDELIM_KEY(bh, pos); + le_key = internal_key(bh, pos); if (le32_to_cpu(le_key->k_objectid) != key->on_disk_key.k_objectid) { break; @@ -789,26 +810,28 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s } } -/* Form the path to an item and position in this item which contains - file byte defined by key. If there is no such item - corresponding to the key, we point the path to the item with - maximal key less than key, and *pos_in_item is set to one - past the last entry/byte in the item. If searching for entry in a - directory item, and it is not found, *pos_in_item is set to one - entry more than the entry with maximal key which is less than the - sought key. - - Note that if there is no entry in this same node which is one more, - then we point to an imaginary entry. for direct items, the - position is in units of bytes, for indirect items the position is - in units of blocknr entries, for directory items the position is in - units of directory entries. */ - +/* + * Form the path to an item and position in this item which contains + * file byte defined by key. If there is no such item + * corresponding to the key, we point the path to the item with + * maximal key less than key, and *pos_in_item is set to one + * past the last entry/byte in the item. If searching for entry in a + * directory item, and it is not found, *pos_in_item is set to one + * entry more than the entry with maximal key which is less than the + * sought key. + * + * Note that if there is no entry in this same node which is one more, + * then we point to an imaginary entry. for direct items, the + * position is in units of bytes, for indirect items the position is + * in units of blocknr entries, for directory items the position is in + * units of directory entries. + */ /* The function is NOT SCHEDULE-SAFE! */ -int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */ - const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ - struct treepath *search_path /* Filled up by this function. */ - ) +int search_for_position_by_key(struct super_block *sb, + /* Key to search (cpu variable) */ + const struct cpu_key *p_cpu_key, + /* Filled up by this function. */ + struct treepath *search_path) { struct item_head *p_le_ih; /* pointer to on-disk structure */ int blk_size; @@ -830,7 +853,7 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b if (retval == ITEM_FOUND) { RFALSE(!ih_item_len - (B_N_PITEM_HEAD + (item_head (PATH_PLAST_BUFFER(search_path), PATH_LAST_POSITION(search_path))), "PAP-5165: item length equals zero"); @@ -844,14 +867,14 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b /* Item is not found. Set path to the previous item. */ p_le_ih = - B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path), + item_head(PATH_PLAST_BUFFER(search_path), --PATH_LAST_POSITION(search_path)); blk_size = sb->s_blocksize; - if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { + if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key)) return FILE_NOT_FOUND; - } - // FIXME: quite ugly this far + + /* FIXME: quite ugly this far */ item_offset = le_ih_k_offset(p_le_ih); offset = cpu_key_k_offset(p_cpu_key); @@ -866,8 +889,10 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b return POSITION_FOUND; } - /* Needed byte is not contained in the item pointed to by the - path. Set pos_in_item out of the item. */ + /* + * Needed byte is not contained in the item pointed to by the + * path. Set pos_in_item out of the item. + */ if (is_indirect_le_ih(p_le_ih)) pos_in_item(search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE; @@ -892,19 +917,17 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path) return 1; /* we need only to know, whether it is the same item */ - ih = get_ih(path); + ih = tp_item_head(path); return memcmp(stored_ih, ih, IH_SIZE); } -/* unformatted nodes are not logged anymore, ever. This is safe -** now -*/ +/* unformatted nodes are not logged anymore, ever. This is safe now */ #define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) -// block can not be forgotten as it is in I/O or held by someone +/* block can not be forgotten as it is in I/O or held by someone */ #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) -// prepare for delete or cut of direct item +/* prepare for delete or cut of direct item */ static inline int prepare_for_direct_item(struct treepath *path, struct item_head *le_ih, struct inode *inode, @@ -917,9 +940,8 @@ static inline int prepare_for_direct_item(struct treepath *path, *cut_size = -(IH_SIZE + ih_item_len(le_ih)); return M_DELETE; } - // new file gets truncated + /* new file gets truncated */ if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { - // round_len = ROUND_UP(new_file_length); /* this was new_file_length < le_ih ... */ if (round_len < le_ih_k_offset(le_ih)) { @@ -933,12 +955,13 @@ static inline int prepare_for_direct_item(struct treepath *path, return M_CUT; /* Cut from this item. */ } - // old file: items may have any length + /* old file: items may have any length */ if (new_file_length < le_ih_k_offset(le_ih)) { *cut_size = -(IH_SIZE + ih_item_len(le_ih)); return M_DELETE; /* Delete this item. */ } + /* Calculate first position and size for cutting from item. */ *cut_size = -(ih_item_len(le_ih) - (pos_in_item(path) = @@ -957,12 +980,15 @@ static inline int prepare_for_direntry_item(struct treepath *path, RFALSE(ih_entry_count(le_ih) != 2, "PAP-5220: incorrect empty directory item (%h)", le_ih); *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ + /* Delete the directory item containing "." and ".." entry. */ + return M_DELETE; } if (ih_entry_count(le_ih) == 1) { - /* Delete the directory item such as there is one record only - in this item */ + /* + * Delete the directory item such as there is one record only + * in this item + */ *cut_size = -(IH_SIZE + ih_item_len(le_ih)); return M_DELETE; } @@ -976,18 +1002,34 @@ static inline int prepare_for_direntry_item(struct treepath *path, #define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1) -/* If the path points to a directory or direct item, calculate mode and the size cut, for balance. - If the path points to an indirect item, remove some number of its unformatted nodes. - In case of file truncate calculate whether this item must be deleted/truncated or last - unformatted node of this item will be converted to a direct item. - This function returns a determination of what balance mode the calling function should employ. */ -static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed - from end of the file. */ - int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */ +/* + * If the path points to a directory or direct item, calculate mode + * and the size cut, for balance. + * If the path points to an indirect item, remove some number of its + * unformatted nodes. + * In case of file truncate calculate whether this item must be + * deleted/truncated or last unformatted node of this item will be + * converted to a direct item. + * This function returns a determination of what balance mode the + * calling function should employ. + */ +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct treepath *path, + const struct cpu_key *item_key, + /* + * Number of unformatted nodes + * which were removed from end + * of the file. + */ + int *removed, + int *cut_size, + /* MAX_KEY_OFFSET in case of delete. */ + unsigned long long new_file_length ) { struct super_block *sb = inode->i_sb; - struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + struct item_head *p_le_ih = tp_item_head(path); struct buffer_head *bh = PATH_PLAST_BUFFER(path); BUG_ON(!th->t_trans_id); @@ -1023,8 +1065,10 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st int pos = 0; if ( new_file_length == max_reiserfs_offset (inode) ) { - /* prepare_for_delete_or_cut() is called by - * reiserfs_delete_item() */ + /* + * prepare_for_delete_or_cut() is called by + * reiserfs_delete_item() + */ new_file_length = 0; delete = 1; } @@ -1033,27 +1077,30 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st need_re_search = 0; *cut_size = 0; bh = PATH_PLAST_BUFFER(path); - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); pos = I_UNFM_NUM(&s_ih); while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) { __le32 *unfm; __u32 block; - /* Each unformatted block deletion may involve one additional - * bitmap block into the transaction, thereby the initial - * journal space reservation might not be enough. */ + /* + * Each unformatted block deletion may involve + * one additional bitmap block into the transaction, + * thereby the initial journal space reservation + * might not be enough. + */ if (!delete && (*cut_size) != 0 && reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) break; - unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1; + unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1; block = get_block_num(unfm, 0); if (block != 0) { reiserfs_prepare_for_journal(sb, bh, 1); put_block_num(unfm, 0, 0); - journal_mark_dirty(th, sb, bh); + journal_mark_dirty(th, bh); reiserfs_free_block(th, inode, block, 1); } @@ -1074,17 +1121,21 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st break; } } - /* a trick. If the buffer has been logged, this will do nothing. If - ** we've broken the loop without logging it, it will restore the - ** buffer */ + /* + * a trick. If the buffer has been logged, this will + * do nothing. If we've broken the loop without logging + * it, it will restore the buffer + */ reiserfs_restore_prepared_buffer(sb, bh); } while (need_re_search && search_for_position_by_key(sb, item_key, path) == POSITION_FOUND); pos_in_item(path) = pos * UNFM_P_SIZE; if (*cut_size == 0) { - /* Nothing were cut. maybe convert last unformatted node to the - * direct item? */ + /* + * Nothing was cut. maybe convert last unformatted node to the + * direct item? + */ result = M_CONVERT; } return result; @@ -1095,7 +1146,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) { int del_size; - struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path); + struct item_head *p_le_ih = tp_item_head(tb->tb_path); if (is_statdata_le_ih(p_le_ih)) return 0; @@ -1104,9 +1155,11 @@ static int calc_deleted_bytes_number(struct tree_balance *tb, char mode) (mode == M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0]; if (is_direntry_le_ih(p_le_ih)) { - /* return EMPTY_DIR_SIZE; We delete emty directoris only. - * we can't use EMPTY_DIR_SIZE, as old format dirs have a different - * empty size. ick. FIXME, is this right? */ + /* + * return EMPTY_DIR_SIZE; We delete emty directories only. + * we can't use EMPTY_DIR_SIZE, as old format dirs have a + * different empty size. ick. FIXME, is this right? + */ return del_size; } @@ -1169,7 +1222,8 @@ char head2type(struct item_head *ih) } #endif -/* Delete object item. +/* + * Delete object item. * th - active transaction handle * path - path to the deleted item * item_key - key to search for the deleted item @@ -1212,7 +1266,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); s_del_balance.insert_size[0] = del_size; ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); @@ -1221,7 +1275,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, PROC_INFO_INC(sb, delete_item_restarted); - // file system changed, repeat search + /* file system changed, repeat search */ ret_value = search_for_position_by_key(sb, item_key, path); if (ret_value == IO_ERROR) @@ -1238,16 +1292,18 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, unfix_nodes(&s_del_balance); return 0; } - // reiserfs_delete_item returns item length when success + + /* reiserfs_delete_item returns item length when success */ ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); - q_ih = get_ih(path); + q_ih = tp_item_head(path); quota_cut_bytes = ih_item_len(q_ih); - /* hack so the quota code doesn't have to guess if the file - ** has a tail. On tail insert, we allocate quota for 1 unformatted node. - ** We test the offset because the tail might have been - ** split into multiple items, and we only want to decrement for - ** the unfm node once + /* + * hack so the quota code doesn't have to guess if the file has a + * tail. On tail insert, we allocate quota for 1 unformatted node. + * We test the offset because the tail might have been + * split into multiple items, and we only want to decrement for + * the unfm node once */ if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) { if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) { @@ -1261,33 +1317,38 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, int off; char *data; - /* We are in direct2indirect conversion, so move tail contents - to the unformatted node */ - /* note, we do the copy before preparing the buffer because we - ** don't care about the contents of the unformatted node yet. - ** the only thing we really care about is the direct item's data - ** is in the unformatted node. - ** - ** Otherwise, we would have to call reiserfs_prepare_for_journal on - ** the unformatted node, which might schedule, meaning we'd have to - ** loop all the way back up to the start of the while loop. - ** - ** The unformatted node must be dirtied later on. We can't be - ** sure here if the entire tail has been deleted yet. - ** - ** un_bh is from the page cache (all unformatted nodes are - ** from the page cache) and might be a highmem page. So, we - ** can't use un_bh->b_data. - ** -clm + /* + * We are in direct2indirect conversion, so move tail contents + * to the unformatted node + */ + /* + * note, we do the copy before preparing the buffer because we + * don't care about the contents of the unformatted node yet. + * the only thing we really care about is the direct item's + * data is in the unformatted node. + * + * Otherwise, we would have to call + * reiserfs_prepare_for_journal on the unformatted node, + * which might schedule, meaning we'd have to loop all the + * way back up to the start of the while loop. + * + * The unformatted node must be dirtied later on. We can't be + * sure here if the entire tail has been deleted yet. + * + * un_bh is from the page cache (all unformatted nodes are + * from the page cache) and might be a highmem page. So, we + * can't use un_bh->b_data. + * -clm */ data = kmap_atomic(un_bh->b_page); off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); memcpy(data + off, - B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih), + ih_item_body(PATH_PLAST_BUFFER(path), &s_ih), ret_value); kunmap_atomic(data); } + /* Perform balancing after all resources have been collected at once. */ do_balance(&s_del_balance, NULL, NULL, M_DELETE); @@ -1304,20 +1365,21 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, return ret_value; } -/* Summary Of Mechanisms For Handling Collisions Between Processes: - - deletion of the body of the object is performed by iput(), with the - result that if multiple processes are operating on a file, the - deletion of the body of the file is deferred until the last process - that has an open inode performs its iput(). - - writes and truncates are protected from collisions by use of - semaphores. - - creates, linking, and mknod are protected from collisions with other - processes by making the reiserfs_add_entry() the last step in the - creation, and then rolling back all changes if there was a collision. - - Hans +/* + * Summary Of Mechanisms For Handling Collisions Between Processes: + * + * deletion of the body of the object is performed by iput(), with the + * result that if multiple processes are operating on a file, the + * deletion of the body of the file is deferred until the last process + * that has an open inode performs its iput(). + * + * writes and truncates are protected from collisions by use of + * semaphores. + * + * creates, linking, and mknod are protected from collisions with other + * processes by making the reiserfs_add_entry() the last step in the + * creation, and then rolling back all changes if there was a collision. + * - Hans */ /* this deletes item which never gets split */ @@ -1347,7 +1409,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, } if (retval != ITEM_FOUND) { pathrelse(&path); - // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir + /* + * No need for a warning, if there is just no free + * space to insert '..' item into the + * newly-created subdir + */ if (! ((unsigned long long) GET_HASH_VALUE(le_key_k_offset @@ -1362,11 +1428,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, } if (!tb_init) { tb_init = 1; - item_len = ih_item_len(PATH_PITEM_HEAD(&path)); + item_len = ih_item_len(tp_item_head(&path)); init_tb_struct(th, &tb, th->t_super, &path, -(IH_SIZE + item_len)); } - quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)); + quota_cut_bytes = ih_item_len(tp_item_head(&path)); retval = fix_nodes(M_DELETE, &tb, NULL, NULL); if (retval == REPEAT_SEARCH) { @@ -1376,7 +1442,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, if (retval == CARRY_ON) { do_balance(&tb, NULL, NULL, M_DELETE); - if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ + /* + * Should we count quota for item? (we don't + * count quotas for save-links) + */ + if (inode) { int depth; #ifdef REISERQUOTA_DEBUG reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, @@ -1391,7 +1461,8 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, } break; } - // IO_ERROR, NO_DISK_SPACE, etc + + /* IO_ERROR, NO_DISK_SPACE, etc */ reiserfs_warning(th->t_super, "vs-5360", "could not delete %K due to fix_nodes failure", &cpu_key); @@ -1447,11 +1518,13 @@ static void unmap_buffers(struct page *page, loff_t pos) do { next = bh->b_this_page; - /* we want to unmap the buffers that contain the tail, and - ** all the buffers after it (since the tail must be at the - ** end of the file). We don't want to unmap file data - ** before the tail, since it might be dirty and waiting to - ** reach disk + /* + * we want to unmap the buffers that contain + * the tail, and all the buffers after it + * (since the tail must be at the end of the + * file). We don't want to unmap file data + * before the tail, since it might be dirty + * and waiting to reach disk */ cur_index += bh->b_size; if (cur_index > tail_index) { @@ -1476,9 +1549,10 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); BUG_ON(new_file_size != inode->i_size); - /* the page being sent in could be NULL if there was an i/o error - ** reading in the last block. The user will hit problems trying to - ** read the file, but for now we just skip the indirect2direct + /* + * the page being sent in could be NULL if there was an i/o error + * reading in the last block. The user will hit problems trying to + * read the file, but for now we just skip the indirect2direct */ if (atomic_read(&inode->i_count) > 1 || !tail_has_to_be_packed(inode) || @@ -1490,17 +1564,18 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, pathrelse(path); return cut_bytes; } + /* Perform the conversion to a direct_item. */ - /* return indirect_to_direct(inode, path, item_key, - new_file_size, mode); */ return indirect2direct(th, inode, page, path, item_key, new_file_size, mode); } -/* we did indirect_to_direct conversion. And we have inserted direct - item successesfully, but there were no disk space to cut unfm - pointer being converted. Therefore we have to delete inserted - direct item(s) */ +/* + * we did indirect_to_direct conversion. And we have inserted direct + * item successesfully, but there were no disk space to cut unfm + * pointer being converted. Therefore we have to delete inserted + * direct item(s) + */ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path) { @@ -1509,7 +1584,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, int removed; BUG_ON(!th->t_trans_id); - make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); // !!!! + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); tail_key.key_length = 4; tail_len = @@ -1521,7 +1596,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, reiserfs_panic(inode->i_sb, "vs-5615", "found invalid item"); RFALSE(path->pos_in_item != - ih_item_len(PATH_PITEM_HEAD(path)) - 1, + ih_item_len(tp_item_head(path)) - 1, "vs-5616: appended bytes found"); PATH_LAST_POSITION(path)--; @@ -1539,7 +1614,6 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct " "conversion has been rolled back due to " "lack of disk space"); - //mark_file_without_tail (inode); mark_inode_dirty(inode); } @@ -1551,15 +1625,18 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, struct page *page, loff_t new_file_size) { struct super_block *sb = inode->i_sb; - /* Every function which is going to call do_balance must first - create a tree_balance structure. Then it must fill up this - structure by using the init_tb_struct and fix_nodes functions. - After that we can make tree balancing. */ + /* + * Every function which is going to call do_balance must first + * create a tree_balance structure. Then it must fill up this + * structure by using the init_tb_struct and fix_nodes functions. + * After that we can make tree balancing. + */ struct tree_balance s_cut_balance; struct item_head *p_le_ih; - int cut_size = 0, /* Amount to be cut. */ - ret_value = CARRY_ON, removed = 0, /* Number of the removed unformatted nodes. */ - is_inode_locked = 0; + int cut_size = 0; /* Amount to be cut. */ + int ret_value = CARRY_ON; + int removed = 0; /* Number of the removed unformatted nodes. */ + int is_inode_locked = 0; char mode; /* Mode of the balance. */ int retval2 = -1; int quota_cut_bytes; @@ -1571,21 +1648,27 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, init_tb_struct(th, &s_cut_balance, inode->i_sb, path, cut_size); - /* Repeat this loop until we either cut the item without needing - to balance, or we fix_nodes without schedule occurring */ + /* + * Repeat this loop until we either cut the item without needing + * to balance, or we fix_nodes without schedule occurring + */ while (1) { - /* Determine the balance mode, position of the first byte to - be cut, and size to be cut. In case of the indirect item - free unformatted nodes which are pointed to by the cut - pointers. */ + /* + * Determine the balance mode, position of the first byte to + * be cut, and size to be cut. In case of the indirect item + * free unformatted nodes which are pointed to by the cut + * pointers. + */ mode = prepare_for_delete_or_cut(th, inode, path, item_key, &removed, &cut_size, new_file_size); if (mode == M_CONVERT) { - /* convert last unformatted node to direct item or leave - tail in the unformatted node */ + /* + * convert last unformatted node to direct item or + * leave tail in the unformatted node + */ RFALSE(ret_value != CARRY_ON, "PAP-5570: can not convert twice"); @@ -1599,15 +1682,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, is_inode_locked = 1; - /* removing of last unformatted node will change value we - have to return to truncate. Save it */ + /* + * removing of last unformatted node will + * change value we have to return to truncate. + * Save it + */ retval2 = ret_value; - /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */ - /* So, we have performed the first part of the conversion: - inserting the new direct item. Now we are removing the - last unformatted node pointer. Set key to search for - it. */ + /* + * So, we have performed the first part of the + * conversion: + * inserting the new direct item. Now we are + * removing the last unformatted node pointer. + * Set key to search for it. + */ set_cpu_key_k_type(item_key, TYPE_INDIRECT); item_key->key_length = 4; new_file_size -= @@ -1650,11 +1738,13 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, return (ret_value == IO_ERROR) ? -EIO : -ENOENT; } /* while */ - // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) + /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */ if (ret_value != CARRY_ON) { if (is_inode_locked) { - // FIXME: this seems to be not needed: we are always able - // to cut item + /* + * FIXME: this seems to be not needed: we are always + * able to cut item + */ indirect_to_direct_roll_back(th, inode, path); } if (ret_value == NO_DISK_SPACE) @@ -1671,22 +1761,23 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, /* Calculate number of bytes that need to be cut from the item. */ quota_cut_bytes = (mode == - M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance. + M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance. insert_size[0]; if (retval2 == -1) ret_value = calc_deleted_bytes_number(&s_cut_balance, mode); else ret_value = retval2; - /* For direct items, we only change the quota when deleting the last - ** item. + /* + * For direct items, we only change the quota when deleting the last + * item. */ - p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); + p_le_ih = tp_item_head(s_cut_balance.tb_path); if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) { if (mode == M_DELETE && (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) == 1) { - // FIXME: this is to keep 3.5 happy + /* FIXME: this is to keep 3.5 happy */ REISERFS_I(inode)->i_first_direct_byte = U32_MAX; quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE; } else { @@ -1696,10 +1787,12 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, #ifdef CONFIG_REISERFS_CHECK if (is_inode_locked) { struct item_head *le_ih = - PATH_PITEM_HEAD(s_cut_balance.tb_path); - /* we are going to complete indirect2direct conversion. Make - sure, that we exactly remove last unformatted node pointer - of the item */ + tp_item_head(s_cut_balance.tb_path); + /* + * we are going to complete indirect2direct conversion. Make + * sure, that we exactly remove last unformatted node pointer + * of the item + */ if (!is_indirect_le_ih(le_ih)) reiserfs_panic(sb, "vs-5652", "item must be indirect %h", le_ih); @@ -1717,17 +1810,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, "(CUT, insert_size==%d)", le_ih, s_cut_balance.insert_size[0]); } - /* it would be useful to make sure, that right neighboring - item is direct item of this file */ + /* + * it would be useful to make sure, that right neighboring + * item is direct item of this file + */ } #endif do_balance(&s_cut_balance, NULL, NULL, mode); if (is_inode_locked) { - /* we've done an indirect->direct conversion. when the data block - ** was freed, it was removed from the list of blocks that must - ** be flushed before the transaction commits, make sure to - ** unmap and invalidate it + /* + * we've done an indirect->direct conversion. when the + * data block was freed, it was removed from the list of + * blocks that must be flushed before the transaction + * commits, make sure to unmap and invalidate it */ unmap_buffers(page, tail_pos); REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; @@ -1758,20 +1854,25 @@ static void truncate_directory(struct reiserfs_transaction_handle *th, set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); } -/* Truncate file to the new size. Note, this must be called with a transaction - already started */ +/* + * Truncate file to the new size. Note, this must be called with a + * transaction already started + */ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, - struct inode *inode, /* ->i_size contains new size */ + struct inode *inode, /* ->i_size contains new size */ struct page *page, /* up to date for last block */ - int update_timestamps /* when it is called by - file_release to convert - the tail - no timestamps - should be updated */ + /* + * when it is called by file_release to convert + * the tail - no timestamps should be updated + */ + int update_timestamps ) { INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ struct item_head *p_le_ih; /* Pointer to an item header. */ - struct cpu_key s_item_key; /* Key to search for a previous file item. */ + + /* Key to search for a previous file item. */ + struct cpu_key s_item_key; loff_t file_size, /* Old file size. */ new_file_size; /* New file size. */ int deleted; /* Number of deleted or truncated bytes. */ @@ -1784,8 +1885,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, || S_ISLNK(inode->i_mode))) return 0; + /* deletion of directory - no need to update timestamps */ if (S_ISDIR(inode->i_mode)) { - // deletion of directory - no need to update timestamps truncate_directory(th, inode); return 0; } @@ -1793,7 +1894,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, /* Get new file size. */ new_file_size = inode->i_size; - // FIXME: note, that key type is unimportant here + /* FIXME: note, that key type is unimportant here */ make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, 3); @@ -1819,7 +1920,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, s_search_path.pos_in_item--; /* Get real file size (total length of all file items) */ - p_le_ih = PATH_PITEM_HEAD(&s_search_path); + p_le_ih = tp_item_head(&s_search_path); if (is_statdata_le_ih(p_le_ih)) file_size = 0; else { @@ -1827,9 +1928,11 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, int bytes = op_bytes_number(p_le_ih, inode->i_sb->s_blocksize); - /* this may mismatch with real file size: if last direct item - had no padding zeros and last unformatted node had no free - space, this file would have this file size */ + /* + * this may mismatch with real file size: if last direct item + * had no padding zeros and last unformatted node had no free + * space, this file would have this file size + */ file_size = offset + bytes - 1; } /* @@ -1867,18 +1970,20 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, set_cpu_key_k_offset(&s_item_key, file_size); - /* While there are bytes to truncate and previous file item is presented in the tree. */ + /* + * While there are bytes to truncate and previous + * file item is presented in the tree. + */ /* - ** This loop could take a really long time, and could log - ** many more blocks than a transaction can hold. So, we do a polite - ** journal end here, and if the transaction needs ending, we make - ** sure the file is consistent before ending the current trans - ** and starting a new one + * This loop could take a really long time, and could log + * many more blocks than a transaction can hold. So, we do + * a polite journal end here, and if the transaction needs + * ending, we make sure the file is consistent before ending + * the current trans and starting a new one */ if (journal_transaction_should_end(th, 0) || reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) { - int orig_len_alloc = th->t_blocks_allocated; pathrelse(&s_search_path); if (update_timestamps) { @@ -1887,7 +1992,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, } reiserfs_update_sd(th, inode); - err = journal_end(th, inode->i_sb, orig_len_alloc); + err = journal_end(th); if (err) goto out; err = journal_begin(th, inode->i_sb, @@ -1904,25 +2009,25 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", new_file_size, file_size, s_item_key.on_disk_key.k_objectid); - update_and_out: +update_and_out: if (update_timestamps) { - // this is truncate, not file closing + /* this is truncate, not file closing */ inode->i_mtime = CURRENT_TIME_SEC; inode->i_ctime = CURRENT_TIME_SEC; } reiserfs_update_sd(th, inode); - out: +out: pathrelse(&s_search_path); return err; } #ifdef CONFIG_REISERFS_CHECK -// this makes sure, that we __append__, not overwrite or add holes +/* this makes sure, that we __append__, not overwrite or add holes */ static void check_research_for_paste(struct treepath *path, const struct cpu_key *key) { - struct item_head *found_ih = get_ih(path); + struct item_head *found_ih = tp_item_head(path); if (is_direct_le_ih(found_ih)) { if (le_ih_k_offset(found_ih) + @@ -1952,13 +2057,22 @@ static void check_research_for_paste(struct treepath *path, } #endif /* config reiserfs check */ -/* Paste bytes to the existing item. Returns bytes number pasted into the item. */ -int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */ - const struct cpu_key *key, /* Key to search for the needed item. */ - struct inode *inode, /* Inode item belongs to */ - const char *body, /* Pointer to the bytes to paste. */ +/* + * Paste bytes to the existing item. + * Returns bytes number pasted into the item. + */ +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, + /* Path to the pasted item. */ + struct treepath *search_path, + /* Key to search for the needed item. */ + const struct cpu_key *key, + /* Inode item belongs to */ + struct inode *inode, + /* Pointer to the bytes to paste. */ + const char *body, + /* Size of pasted bytes. */ int pasted_size) -{ /* Size of pasted bytes. */ +{ struct super_block *sb = inode->i_sb; struct tree_balance s_paste_balance; int retval; @@ -1973,7 +2087,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", pasted_size, inode->i_uid, - key2type(&(key->on_disk_key))); + key2type(&key->on_disk_key)); #endif depth = reiserfs_write_unlock_nested(sb); @@ -1997,7 +2111,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, body)) == REPEAT_SEARCH) { - search_again: +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC(th->t_super, paste_into_item_restarted); retval = @@ -2019,21 +2133,23 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree #endif } - /* Perform balancing after all resources are collected by fix_nodes, and - accessing them will not risk triggering schedule. */ + /* + * Perform balancing after all resources are collected by fix_nodes, + * and accessing them will not risk triggering schedule. + */ if (retval == CARRY_ON) { do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE); return 0; } retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; - error_out: +error_out: /* this also releases the path */ unfix_nodes(&s_paste_balance); #ifdef REISERQUOTA_DEBUG reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", pasted_size, inode->i_uid, - key2type(&(key->on_disk_key))); + key2type(&key->on_disk_key)); #endif depth = reiserfs_write_unlock_nested(sb); dquot_free_space_nodirty(inode, pasted_size); @@ -2041,7 +2157,8 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree return retval; } -/* Insert new item into the buffer at the path. +/* + * Insert new item into the buffer at the path. * th - active transaction handle * path - path to the inserted item * ih - pointer to the item header to insert @@ -2064,8 +2181,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, fs_gen = get_generation(inode->i_sb); quota_bytes = ih_item_len(ih); - /* hack so the quota code doesn't have to guess if the file has - ** a tail, links are always tails, so there's no guessing needed + /* + * hack so the quota code doesn't have to guess + * if the file has a tail, links are always tails, + * so there's no guessing needed */ if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih)) quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; @@ -2074,8 +2193,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif - /* We can't dirty inode here. It would be immediately written but - * appropriate stat item isn't inserted yet... */ + /* + * We can't dirty inode here. It would be immediately + * written but appropriate stat item isn't inserted yet... + */ depth = reiserfs_write_unlock_nested(inode->i_sb); retval = dquot_alloc_space_nodirty(inode, quota_bytes); reiserfs_write_lock_nested(inode->i_sb, depth); @@ -2089,7 +2210,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, #ifdef DISPLACE_NEW_PACKING_LOCALITIES s_ins_balance.key = key->on_disk_key; #endif - /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + /* + * DQUOT_* can schedule, must check to be sure calling + * fix_nodes is safe + */ if (inode && fs_changed(fs_gen, inode->i_sb)) { goto search_again; } @@ -2097,7 +2221,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, while ((retval = fix_nodes(M_INSERT, &s_ins_balance, ih, body)) == REPEAT_SEARCH) { - search_again: +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC(th->t_super, insert_item_restarted); retval = search_item(th->t_super, key, path); @@ -2121,7 +2245,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, } retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; - error_out: +error_out: /* also releases the path */ unfix_nodes(&s_ins_balance); #ifdef REISERQUOTA_DEBUG diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 9fb20426005e..a392cef6acc6 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -74,7 +74,7 @@ static int reiserfs_sync_fs(struct super_block *s, int wait) dquot_writeback_dquots(s, -1); reiserfs_write_lock(s); if (!journal_begin(&th, s, 1)) - if (!journal_end_sync(&th, s, 1)) + if (!journal_end_sync(&th)) reiserfs_flush_old_commits(s); reiserfs_write_unlock(s); return 0; @@ -136,9 +136,9 @@ static int reiserfs_freeze(struct super_block *s) } else { reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); reiserfs_block_writes(&th); - journal_end_sync(&th, s, 1); + journal_end_sync(&th); } } reiserfs_write_unlock(s); @@ -153,13 +153,15 @@ static int reiserfs_unfreeze(struct super_block *s) extern const struct in_core_key MAX_IN_CORE_KEY; -/* this is used to delete "save link" when there are no items of a - file it points to. It can either happen if unlink is completed but - "save unlink" removal, or if file has both unlink and truncate - pending and as unlink completes first (because key of "save link" - protecting unlink is bigger that a key lf "save link" which - protects truncate), so there left no items to make truncate - completion on */ +/* + * this is used to delete "save link" when there are no items of a + * file it points to. It can either happen if unlink is completed but + * "save unlink" removal, or if file has both unlink and truncate + * pending and as unlink completes first (because key of "save link" + * protecting unlink is bigger that a key lf "save link" which + * protects truncate), so there left no items to make truncate + * completion on + */ static int remove_save_link_only(struct super_block *s, struct reiserfs_key *key, int oid_free) { @@ -176,7 +178,7 @@ static int remove_save_link_only(struct super_block *s, /* removals are protected by direct items */ reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); - return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT); + return journal_end(&th); } #ifdef CONFIG_QUOTA @@ -258,7 +260,7 @@ static int finish_unfinished(struct super_block *s) break; } item_pos--; - ih = B_N_PITEM_HEAD(bh, item_pos); + ih = item_head(bh, item_pos); if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) /* there are no "save" links anymore */ @@ -271,7 +273,7 @@ static int finish_unfinished(struct super_block *s) truncate = 0; /* reiserfs_iget needs k_dirid and k_objectid only */ - item = B_I_PITEM(bh, ih); + item = ih_item_body(bh, ih); obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); obj_key.on_disk_key.k_objectid = le32_to_cpu(ih->ih_key.k_objectid); @@ -282,8 +284,10 @@ static int finish_unfinished(struct super_block *s) inode = reiserfs_iget(s, &obj_key); if (!inode) { - /* the unlink almost completed, it just did not manage to remove - "save" link and release objectid */ + /* + * the unlink almost completed, it just did not + * manage to remove "save" link and release objectid + */ reiserfs_warning(s, "vs-2180", "iget failed for %K", &obj_key); retval = remove_save_link_only(s, &save_link_key, 1); @@ -303,10 +307,13 @@ static int finish_unfinished(struct super_block *s) reiserfs_write_lock_nested(inode->i_sb, depth); if (truncate && S_ISDIR(inode->i_mode)) { - /* We got a truncate request for a dir which is impossible. - The only imaginable way is to execute unfinished truncate request - then boot into old kernel, remove the file and create dir with - the same key. */ + /* + * We got a truncate request for a dir which + * is impossible. The only imaginable way is to + * execute unfinished truncate request then boot + * into old kernel, remove the file and create dir + * with the same key. + */ reiserfs_warning(s, "green-2101", "impossible truncate on a " "directory %k. Please report", @@ -320,14 +327,16 @@ static int finish_unfinished(struct super_block *s) if (truncate) { REISERFS_I(inode)->i_flags |= i_link_saved_truncate_mask; - /* not completed truncate found. New size was committed together - with "save" link */ + /* + * not completed truncate found. New size was + * committed together with "save" link + */ reiserfs_info(s, "Truncating %k to %Ld ..", INODE_PKEY(inode), inode->i_size); - reiserfs_truncate_file(inode, - 0 - /*don't update modification time */ - ); + + /* don't update modification time */ + reiserfs_truncate_file(inode, 0); + retval = remove_save_link(inode, truncate); } else { REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; @@ -373,10 +382,12 @@ static int finish_unfinished(struct super_block *s) return retval; } -/* to protect file being unlinked from getting lost we "safe" link files - being unlinked. This link will be deleted in the same transaction with last - item of file. mounting the filesystem we scan all these links and remove - files which almost got lost */ +/* + * to protect file being unlinked from getting lost we "safe" link files + * being unlinked. This link will be deleted in the same transaction with last + * item of file. mounting the filesystem we scan all these links and remove + * files which almost got lost + */ void add_save_link(struct reiserfs_transaction_handle *th, struct inode *inode, int truncate) { @@ -495,7 +506,7 @@ int remove_save_link(struct inode *inode, int truncate) } else REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; - return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + return journal_end(&th); } static void reiserfs_kill_sb(struct super_block *s) @@ -530,19 +541,23 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_lock(s); - /* change file system state to current state if it was mounted with read-write permissions */ + /* + * change file system state to current state if it was mounted + * with read-write permissions + */ if (!(s->s_flags & MS_RDONLY)) { if (!journal_begin(&th, s, 10)) { reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), REISERFS_SB(s)->s_mount_state); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); } } - /* note, journal_release checks for readonly mount, and can decide not - ** to do a journal_end + /* + * note, journal_release checks for readonly mount, and can + * decide not to do a journal_end */ journal_release(&th, s); @@ -559,6 +574,7 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); + destroy_workqueue(REISERFS_SB(s)->commit_wq); kfree(s->s_fs_info); s->s_fs_info = NULL; } @@ -634,15 +650,16 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags) } reiserfs_write_lock(inode->i_sb); - /* this is really only used for atime updates, so they don't have - ** to be included in O_SYNC or fsync + /* + * this is really only used for atime updates, so they don't have + * to be included in O_SYNC or fsync */ err = journal_begin(&th, inode->i_sb, 1); if (err) goto out; reiserfs_update_sd(&th, inode); - journal_end(&th, inode->i_sb, 1); + journal_end(&th); out: reiserfs_write_unlock(inode->i_sb); @@ -788,31 +805,53 @@ static const struct export_operations reiserfs_export_ops = { .get_parent = reiserfs_get_parent, }; -/* this struct is used in reiserfs_getopt () for containing the value for those - mount options that have values rather than being toggles. */ +/* + * this struct is used in reiserfs_getopt () for containing the value for + * those mount options that have values rather than being toggles. + */ typedef struct { char *value; - int setmask; /* bitmask which is to set on mount_options bitmask when this - value is found, 0 is no bits are to be changed. */ - int clrmask; /* bitmask which is to clear on mount_options bitmask when this - value is found, 0 is no bits are to be changed. This is - applied BEFORE setmask */ + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + */ + int setmask; + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; } arg_desc_t; /* Set this bit in arg_required to allow empty arguments */ #define REISERFS_OPT_ALLOWEMPTY 31 -/* this struct is used in reiserfs_getopt() for describing the set of reiserfs - mount options */ +/* + * this struct is used in reiserfs_getopt() for describing the + * set of reiserfs mount options + */ typedef struct { char *option_name; - int arg_required; /* 0 if argument is not required, not 0 otherwise */ - const arg_desc_t *values; /* list of values accepted by an option */ - int setmask; /* bitmask which is to set on mount_options bitmask when this - value is found, 0 is no bits are to be changed. */ - int clrmask; /* bitmask which is to clear on mount_options bitmask when this - value is found, 0 is no bits are to be changed. This is - applied BEFORE setmask */ + + /* 0 if argument is not required, not 0 otherwise */ + int arg_required; + + /* list of values accepted by an option */ + const arg_desc_t *values; + + /* + * bitmask which is to set on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + */ + int setmask; + + /* + * bitmask which is to clear on mount_options bitmask + * when this value is found, 0 is no bits are to be changed. + * This is applied BEFORE setmask + */ + int clrmask; } opt_desc_t; /* possible values for -o data= */ @@ -833,8 +872,10 @@ static const arg_desc_t barrier_mode[] = { {.value = NULL} }; -/* possible values for "-o block-allocator=" and bits which are to be set in - s_mount_opt of reiserfs specific part of in-core super block */ +/* + * possible values for "-o block-allocator=" and bits which are to be set in + * s_mount_opt of reiserfs specific part of in-core super block + */ static const arg_desc_t balloc[] = { {"noborder", 1 << REISERFS_NO_BORDER, 0}, {"border", 0, 1 << REISERFS_NO_BORDER}, @@ -864,21 +905,25 @@ static const arg_desc_t error_actions[] = { {NULL, 0, 0}, }; -/* proceed only one option from a list *cur - string containing of mount options - opts - array of options which are accepted - opt_arg - if option is found and requires an argument and if it is specifed - in the input - pointer to the argument is stored here - bit_flags - if option requires to set a certain bit - it is set here - return -1 if unknown option is found, opt->arg_required otherwise */ +/* + * proceed only one option from a list *cur - string containing of mount + * options + * opts - array of options which are accepted + * opt_arg - if option is found and requires an argument and if it is specifed + * in the input - pointer to the argument is stored here + * bit_flags - if option requires to set a certain bit - it is set here + * return -1 if unknown option is found, opt->arg_required otherwise + */ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, char **opt_arg, unsigned long *bit_flags) { char *p; - /* foo=bar, - ^ ^ ^ - | | +-- option_end - | +-- arg_start - +-- option_start + /* + * foo=bar, + * ^ ^ ^ + * | | +-- option_end + * | +-- arg_start + * +-- option_start */ const opt_desc_t *opt; const arg_desc_t *arg; @@ -893,9 +938,12 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, } if (!strncmp(p, "alloc=", 6)) { - /* Ugly special case, probably we should redo options parser so that - it can understand several arguments for some options, also so that - it can fill several bitfields with option values. */ + /* + * Ugly special case, probably we should redo options + * parser so that it can understand several arguments for + * some options, also so that it can fill several bitfields + * with option values. + */ if (reiserfs_parse_alloc_options(s, p + 6)) { return -1; } else { @@ -958,7 +1006,10 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, return -1; } - /* move to the argument, or to next option if argument is not required */ + /* + * move to the argument, or to next option if argument is not + * required + */ p++; if (opt->arg_required @@ -995,12 +1046,20 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, } /* returns 0 if something is wrong in option string, 1 - otherwise */ -static int reiserfs_parse_options(struct super_block *s, char *options, /* string given via mount's -o */ +static int reiserfs_parse_options(struct super_block *s, + + /* string given via mount's -o */ + char *options, + + /* + * after the parsing phase, contains the + * collection of bitflags defining what + * mount options were selected. + */ unsigned long *mount_options, - /* after the parsing phase, contains the - collection of bitflags defining what - mount options were selected. */ - unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ + + /* strtol-ed from NNN of resize=NNN */ + unsigned long *blocks, char **jdev_name, unsigned int *commit_max_age, char **qf_names, @@ -1010,7 +1069,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin char *arg = NULL; char *pos; opt_desc_t opts[] = { - /* Compatibility stuff, so that -o notail for old setups still work */ + /* + * Compatibility stuff, so that -o notail for old + * setups still work + */ {"tails",.arg_required = 't',.values = tails}, {"notail",.clrmask = (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, @@ -1055,8 +1117,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin *blocks = 0; if (!options || !*options) - /* use default configuration: create tails, journaling on, no - conversion to newest format */ + /* + * use default configuration: create tails, journaling on, no + * conversion to newest format + */ return 1; for (pos = options; pos;) { @@ -1109,7 +1173,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (c == 'j') { if (arg && *arg && jdev_name) { - if (*jdev_name) { //Hm, already assigned? + /* Hm, already assigned? */ + if (*jdev_name) { reiserfs_warning(s, "super-6510", "journal device was " "already specified to " @@ -1362,8 +1427,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) safe_mask |= 1 << REISERFS_USRQUOTA; safe_mask |= 1 << REISERFS_GRPQUOTA; - /* Update the bitmask, taking care to keep - * the bits we're not allowed to change here */ + /* + * Update the bitmask, taking care to keep + * the bits we're not allowed to change here + */ REISERFS_SB(s)->s_mount_opt = (REISERFS_SB(s)-> s_mount_opt & ~safe_mask) | (mount_options & safe_mask); @@ -1410,7 +1477,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* Mounting a rw partition read-only. */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); } else { /* remount read-write */ if (!(s->s_flags & MS_RDONLY)) { @@ -1427,7 +1494,9 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) handle_data_mode(s, mount_options); handle_barrier_mode(s, mount_options); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ + + /* now it is safe to call journal_begin */ + s->s_flags &= ~MS_RDONLY; err = journal_begin(&th, s, 10); if (err) goto out_err_unlock; @@ -1440,12 +1509,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) if (!old_format_only(s)) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; } /* this will force a full flush of all journal lists */ SB_JOURNAL(s)->j_must_wait = 1; - err = journal_end(&th, s, 10); + err = journal_end(&th); if (err) goto out_err_unlock; @@ -1489,9 +1558,9 @@ static int read_super_block(struct super_block *s, int offset) brelse(bh); return 1; } - // - // ok, reiserfs signature (old or new) found in at the given offset - // + /* + * ok, reiserfs signature (old or new) found in at the given offset + */ fs_blocksize = sb_blocksize(rs); brelse(bh); sb_set_blocksize(s, fs_blocksize); @@ -1529,9 +1598,11 @@ static int read_super_block(struct super_block *s, int offset) SB_BUFFER_WITH_SB(s) = bh; SB_DISK_SUPER_BLOCK(s) = rs; + /* + * magic is of non-standard journal filesystem, look at s_version to + * find which format is in use + */ if (is_reiserfs_jr(rs)) { - /* magic is of non-standard journal filesystem, look at s_version to - find which format is in use */ if (sb_version(rs) == REISERFS_VERSION_2) reiserfs_info(s, "found reiserfs format \"3.6\"" " with non-standard journal\n"); @@ -1545,8 +1616,10 @@ static int read_super_block(struct super_block *s, int offset) return 1; } } else - /* s_version of standard format may contain incorrect information, - so we just look at the magic string */ + /* + * s_version of standard format may contain incorrect + * information, so we just look at the magic string + */ reiserfs_info(s, "found reiserfs format \"%s\" with standard journal\n", is_reiserfs_3_5(rs) ? "3.5" : "3.6"); @@ -1558,8 +1631,9 @@ static int read_super_block(struct super_block *s, int offset) s->dq_op = &reiserfs_quota_operations; #endif - /* new format is limited by the 32 bit wide i_blocks field, want to - ** be one full block below that. + /* + * new format is limited by the 32 bit wide i_blocks field, want to + * be one full block below that. */ s->s_maxbytes = (512LL << 32) - s->s_blocksize; return 0; @@ -1568,7 +1642,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); + ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s)); wait_on_buffer(SB_BUFFER_WITH_SB(s)); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); @@ -1578,14 +1652,15 @@ static int reread_meta_blocks(struct super_block *s) return 0; } -///////////////////////////////////////////////////// -// hash detection stuff +/* hash detection stuff */ -// if root directory is empty - we set default - Yura's - hash and -// warn about it -// FIXME: we look for only one name in a directory. If tea and yura -// bith have the same value - we ask user to send report to the -// mailing list +/* + * if root directory is empty - we set default - Yura's - hash and + * warn about it + * FIXME: we look for only one name in a directory. If tea and yura + * both have the same value - we ask user to send report to the + * mailing list + */ static __u32 find_hash_out(struct super_block *s) { int retval; @@ -1593,92 +1668,83 @@ static __u32 find_hash_out(struct super_block *s) struct cpu_key key; INITIALIZE_PATH(path); struct reiserfs_dir_entry de; + struct reiserfs_de_head *deh; __u32 hash = DEFAULT_HASH; + __u32 deh_hashval, teahash, r5hash, yurahash; inode = s->s_root->d_inode; - do { // Some serious "goto"-hater was there ;) - u32 teahash, r5hash, yurahash; + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key(s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse(&path); + return UNSET_HASH; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num--; + + set_de_name_and_namelen(&de); + deh = de.de_deh + de.de_entry_num; - make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); - retval = search_by_entry_key(s, &key, &path, &de); - if (retval == IO_ERROR) { - pathrelse(&path); - return UNSET_HASH; - } - if (retval == NAME_NOT_FOUND) - de.de_entry_num--; - set_de_name_and_namelen(&de); - if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) { - /* allow override in this case */ - if (reiserfs_rupasov_hash(s)) { - hash = YURA_HASH; - } - reiserfs_info(s, "FS seems to be empty, autodetect " - "is using the default hash\n"); - break; - } - r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); - teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); - yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); - if (((teahash == r5hash) - && - (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) - == r5hash)) || ((teahash == yurahash) - && (yurahash == - GET_HASH_VALUE(deh_offset - (& - (de. - de_deh[de. - de_entry_num]))))) - || ((r5hash == yurahash) - && (yurahash == - GET_HASH_VALUE(deh_offset - (&(de.de_deh[de.de_entry_num])))))) { - reiserfs_warning(s, "reiserfs-2506", "Unable to " - "automatically detect hash function. " - "Please mount with -o " - "hash={tea,rupasov,r5}"); - hash = UNSET_HASH; - break; - } - if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) == - yurahash) + if (deh_offset(deh) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) hash = YURA_HASH; - else if (GET_HASH_VALUE - (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash) - hash = TEA_HASH; - else if (GET_HASH_VALUE - (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) - hash = R5_HASH; - else { - reiserfs_warning(s, "reiserfs-2506", - "Unrecognised hash function"); - hash = UNSET_HASH; - } - } while (0); + reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n"); + goto out; + } + + deh_hashval = GET_HASH_VALUE(deh_offset(deh)); + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); + + if ((teahash == r5hash && deh_hashval == r5hash) || + (teahash == yurahash && deh_hashval == yurahash) || + (r5hash == yurahash && deh_hashval == yurahash)) { + reiserfs_warning(s, "reiserfs-2506", + "Unable to automatically detect hash " + "function. Please mount with -o " + "hash={tea,rupasov,r5}"); + hash = UNSET_HASH; + goto out; + } + if (deh_hashval == yurahash) + hash = YURA_HASH; + else if (deh_hashval == teahash) + hash = TEA_HASH; + else if (deh_hashval == r5hash) + hash = R5_HASH; + else { + reiserfs_warning(s, "reiserfs-2506", + "Unrecognised hash function"); + hash = UNSET_HASH; + } +out: pathrelse(&path); return hash; } -// finds out which hash names are sorted with +/* finds out which hash names are sorted with */ static int what_hash(struct super_block *s) { __u32 code; code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); - /* reiserfs_hash_detect() == true if any of the hash mount options - ** were used. We must check them to make sure the user isn't - ** using a bad hash value + /* + * reiserfs_hash_detect() == true if any of the hash mount options + * were used. We must check them to make sure the user isn't + * using a bad hash value */ if (code == UNSET_HASH || reiserfs_hash_detect(s)) code = find_hash_out(s); if (code != UNSET_HASH && reiserfs_hash_detect(s)) { - /* detection has found the hash, and we must check against the - ** mount options + /* + * detection has found the hash, and we must check against the + * mount options */ if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { reiserfs_warning(s, "reiserfs-2507", @@ -1700,7 +1766,10 @@ static int what_hash(struct super_block *s) code = UNSET_HASH; } } else { - /* find_hash_out was not called or could not determine the hash */ + /* + * find_hash_out was not called or + * could not determine the hash + */ if (reiserfs_rupasov_hash(s)) { code = YURA_HASH; } else if (reiserfs_tea_hash(s)) { @@ -1710,8 +1779,9 @@ static int what_hash(struct super_block *s) } } - /* if we are mounted RW, and we have a new valid hash code, update - ** the super + /* + * if we are mounted RW, and we have a new valid hash code, update + * the super */ if (code != UNSET_HASH && !(s->s_flags & MS_RDONLY) && @@ -1721,7 +1791,7 @@ static int what_hash(struct super_block *s) return code; } -// return pointer to appropriate function +/* return pointer to appropriate function */ static hashf_t hash_function(struct super_block *s) { switch (what_hash(s)) { @@ -1738,7 +1808,7 @@ static hashf_t hash_function(struct super_block *s) return NULL; } -// this is used to set up correct value for old partitions +/* this is used to set up correct value for old partitions */ static int function2code(hashf_t func) { if (func == keyed_hash) @@ -1748,7 +1818,7 @@ static int function2code(hashf_t func) if (func == r5_hash) return R5_HASH; - BUG(); // should never happen + BUG(); /* should never happen */ return 0; } @@ -1783,8 +1853,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL); sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO); sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); - /* no preallocation minimum, be smart in - reiserfs_file_write instead */ + /* no preallocation minimum, be smart in reiserfs_file_write instead */ sbi->s_alloc_options.preallocmin = 0; /* Preallocate by 16 blocks (17-1) at once */ sbi->s_alloc_options.preallocsize = 17; @@ -1796,9 +1865,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) mutex_init(&sbi->lock); sbi->lock_depth = -1; + sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0, + s->s_id); + if (!sbi->commit_wq) { + SWARN(silent, s, "", "Cannot allocate commit workqueue"); + errval = -ENOMEM; + goto error_unlocked; + } + jdev_name = NULL; if (reiserfs_parse_options - (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, + (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name, &commit_max_age, qf_names, &qfmt) == 0) { goto error_unlocked; } @@ -1819,10 +1896,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) goto error_unlocked; } - /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ + /* + * try old format (undistributed bitmap, super block in 8-th 1k + * block of a device) + */ if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) old_format = 1; - /* try new format (64-th 1k block), which can contain reiserfs super block */ + + /* + * try new format (64-th 1k block), which can contain reiserfs + * super block + */ else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", s->s_id); @@ -1830,9 +1914,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } rs = SB_DISK_SUPER_BLOCK(s); - /* Let's do basic sanity check to verify that underlying device is not - smaller than the filesystem. If the check fails then abort and scream, - because bad stuff will happen otherwise. */ + /* + * Let's do basic sanity check to verify that underlying device is not + * smaller than the filesystem. If the check fails then abort and + * scream, because bad stuff will happen otherwise. + */ if (s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs) * sb_blocksize(rs)) { @@ -1876,15 +1962,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) printk("reiserfs: using flush barriers\n"); } - // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { SWARN(silent, s, "sh-2022", "unable to initialize journal space"); goto error_unlocked; } else { - jinit_done = 1; /* once this is set, journal_release must be called - ** if we error out of the mount - */ + /* + * once this is set, journal_release must be called + * if we error out of the mount + */ + jinit_done = 1; } if (reread_meta_blocks(s)) { @@ -1905,7 +1992,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) args.dirid = REISERFS_ROOT_PARENT_OBJECTID; root_inode = iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, - reiserfs_init_locked_inode, (void *)(&args)); + reiserfs_init_locked_inode, (void *)&args); if (!root_inode) { SWARN(silent, s, "jmacd-10", "get root inode failed"); goto error_unlocked; @@ -1929,7 +2016,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) s->s_root = d_make_root(root_inode); if (!s->s_root) goto error; - // define and initialize hash function + /* define and initialize hash function */ sbi->s_hash_function = hash_function(s); if (sbi->s_hash_function == NULL) { dput(s->s_root); @@ -1939,11 +2026,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (is_reiserfs_3_5(rs) || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) - set_bit(REISERFS_3_5, &(sbi->s_properties)); + set_bit(REISERFS_3_5, &sbi->s_properties); else if (old_format) - set_bit(REISERFS_OLD_FORMAT, &(sbi->s_properties)); + set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties); else - set_bit(REISERFS_3_6, &(sbi->s_properties)); + set_bit(REISERFS_3_6, &sbi->s_properties); if (!(s->s_flags & MS_RDONLY)) { @@ -1958,10 +2045,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) set_sb_umount_state(rs, REISERFS_ERROR_FS); set_sb_fs_state(rs, 0); - /* Clear out s_bmap_nr if it would wrap. We can handle this + /* + * Clear out s_bmap_nr if it would wrap. We can handle this * case, but older revisions can't. This will cause the * file system to fail mount on those older implementations, - * avoiding corruption. -jeffm */ + * avoiding corruption. -jeffm + */ if (bmap_would_wrap(reiserfs_bmap_count(s)) && sb_bmap_nr(rs) != 0) { reiserfs_warning(s, "super-2030", "This file system " @@ -1974,8 +2063,10 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } if (old_format_only(s)) { - /* filesystem of format 3.5 either with standard or non-standard - journal */ + /* + * filesystem of format 3.5 either with standard + * or non-standard journal + */ if (convert_reiserfs(s)) { /* and -o conv is given */ if (!silent) @@ -1983,8 +2074,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "converting 3.5 filesystem to the 3.6 format"); if (is_reiserfs_3_5(rs)) - /* put magic string of 3.6 format. 2.2 will not be able to - mount this filesystem anymore */ + /* + * put magic string of 3.6 format. + * 2.2 will not be able to + * mount this filesystem anymore + */ memcpy(rs->s_v1.s_magic, reiserfs_3_6_magic_string, sizeof @@ -1992,8 +2086,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) set_sb_version(rs, REISERFS_VERSION_2); reiserfs_convert_objectid_map_v1(s); - set_bit(REISERFS_3_6, &(sbi->s_properties)); - clear_bit(REISERFS_3_5, &(sbi->s_properties)); + set_bit(REISERFS_3_6, &sbi->s_properties); + clear_bit(REISERFS_3_5, &sbi->s_properties); } else if (!silent) { reiserfs_info(s, "using 3.5.x disk format\n"); } @@ -2001,8 +2095,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); - errval = journal_end(&th, s, 1); + journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s)); + errval = journal_end(&th); if (errval) { dput(s->s_root); s->s_root = NULL; @@ -2018,7 +2112,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } reiserfs_write_lock(s); - /* look for files which were to be removed in previous session */ + /* + * look for files which were to be removed in previous session + */ finish_unfinished(s); } else { if (old_format_only(s) && !silent) { @@ -2034,7 +2130,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) } reiserfs_write_lock(s); } - // mark hash in super block: it could be unset. overwrite should be ok + /* + * mark hash in super block: it could be unset. overwrite should be ok + */ set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); handle_attrs(s); @@ -2111,9 +2209,7 @@ static int reiserfs_write_dquot(struct dquot *dquot) depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_commit(dquot); reiserfs_write_lock_nested(dquot->dq_sb, depth); - err = - journal_end(&th, dquot->dq_sb, - REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + err = journal_end(&th); if (!ret && err) ret = err; out: @@ -2136,9 +2232,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) depth = reiserfs_write_unlock_nested(dquot->dq_sb); ret = dquot_acquire(dquot); reiserfs_write_lock_nested(dquot->dq_sb, depth); - err = - journal_end(&th, dquot->dq_sb, - REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + err = journal_end(&th); if (!ret && err) ret = err; out: @@ -2163,9 +2257,7 @@ static int reiserfs_release_dquot(struct dquot *dquot) } ret = dquot_release(dquot); reiserfs_write_lock(dquot->dq_sb); - err = - journal_end(&th, dquot->dq_sb, - REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + err = journal_end(&th); if (!ret && err) ret = err; reiserfs_write_unlock(dquot->dq_sb); @@ -2198,7 +2290,7 @@ static int reiserfs_write_info(struct super_block *sb, int type) depth = reiserfs_write_unlock_nested(sb); ret = dquot_commit_info(sb, type); reiserfs_write_lock_nested(sb, depth); - err = journal_end(&th, sb, 2); + err = journal_end(&th); if (!ret && err) ret = err; out: @@ -2238,7 +2330,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, goto out; } inode = path->dentry->d_inode; - /* We must not pack tails for quota files on reiserfs for quota IO to work */ + /* + * We must not pack tails for quota files on reiserfs for quota + * IO to work + */ if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { err = reiserfs_unpack(inode, NULL); if (err) { @@ -2268,7 +2363,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, err = journal_begin(&th, sb, 1); if (err) goto out; - err = journal_end_sync(&th, sb, 1); + err = journal_end_sync(&th); if (err) goto out; } @@ -2279,10 +2374,12 @@ out: return err; } -/* Read data from quotafile - avoid pagecache and such because we cannot afford +/* + * Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) - * we don't have to be afraid of races */ + * we don't have to be afraid of races + */ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { @@ -2303,7 +2400,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread; tmp_bh.b_state = 0; - /* Quota files are without tails so we can safely use this function */ + /* + * Quota files are without tails so we can safely + * use this function + */ reiserfs_write_lock(sb); err = reiserfs_get_block(inode, blk, &tmp_bh, 0); reiserfs_write_unlock(sb); @@ -2326,8 +2426,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, return len; } -/* Write to quotafile (we know the transaction is already started and has - * enough credits) */ +/* + * Write to quotafile (we know the transaction is already started and has + * enough credits) + */ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { @@ -2368,7 +2470,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, unlock_buffer(bh); reiserfs_write_lock(sb); reiserfs_prepare_for_journal(sb, bh, 1); - journal_mark_dirty(current->journal_info, sb, bh); + journal_mark_dirty(current->journal_info, bh); if (!journal_quota) reiserfs_add_ordered_list(inode, bh); reiserfs_write_unlock(sb); @@ -2402,18 +2504,18 @@ static int __init init_reiserfs_fs(void) { int ret; - if ((ret = init_inodecache())) { + ret = init_inodecache(); + if (ret) return ret; - } reiserfs_proc_info_global_init(); ret = register_filesystem(&reiserfs_fs_type); + if (ret) + goto out; - if (ret == 0) { - return 0; - } - + return 0; +out: reiserfs_proc_info_global_done(); destroy_inodecache(); diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index 5e2624d12f70..f41e19b4bb42 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -1,5 +1,6 @@ /* - * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright + * details */ #include <linux/time.h> @@ -7,29 +8,41 @@ #include <linux/buffer_head.h> #include "reiserfs.h" -/* access to tail : when one is going to read tail it must make sure, that is not running. - direct2indirect and indirect2direct can not run concurrently */ +/* + * access to tail : when one is going to read tail it must make sure, that is + * not running. direct2indirect and indirect2direct can not run concurrently + */ -/* Converts direct items to an unformatted node. Panics if file has no - tail. -ENOSPC if no disk space for conversion */ -/* path points to first direct item of the file regarless of how many of - them are there */ +/* + * Converts direct items to an unformatted node. Panics if file has no + * tail. -ENOSPC if no disk space for conversion + */ +/* + * path points to first direct item of the file regardless of how many of + * them are there + */ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, struct buffer_head *unbh, loff_t tail_offset) { struct super_block *sb = inode->i_sb; struct buffer_head *up_to_date_bh; - struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + struct item_head *p_le_ih = tp_item_head(path); unsigned long total_tail = 0; - struct cpu_key end_key; /* Key to search for the last byte of the - converted item. */ - struct item_head ind_ih; /* new indirect item to be inserted or - key of unfm pointer to be pasted */ - int blk_size, retval; /* returned value for reiserfs_insert_item and clones */ - unp_t unfm_ptr; /* Handle on an unformatted node - that will be inserted in the - tree. */ + + /* Key to search for the last byte of the converted item. */ + struct cpu_key end_key; + + /* + * new indirect item to be inserted or key + * of unfm pointer to be pasted + */ + struct item_head ind_ih; + int blk_size; + /* returned value for reiserfs_insert_item and clones */ + int retval; + /* Handle on an unformatted node that will be inserted in the tree. */ + unp_t unfm_ptr; BUG_ON(!th->t_trans_id); @@ -37,8 +50,10 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, blk_size = sb->s_blocksize; - /* and key to search for append or insert pointer to the new - unformatted node. */ + /* + * and key to search for append or insert pointer to the new + * unformatted node. + */ copy_item_head(&ind_ih, p_le_ih); set_le_ih_k_offset(&ind_ih, tail_offset); set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); @@ -55,7 +70,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, return -EIO; } - p_le_ih = PATH_PITEM_HEAD(path); + p_le_ih = tp_item_head(path); unfm_ptr = cpu_to_le32(unbh->b_blocknr); @@ -76,36 +91,43 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, if (retval) { return retval; } - // note: from here there are two keys which have matching first - // three key components. They only differ by the fourth one. + /* + * note: from here there are two keys which have matching first + * three key components. They only differ by the fourth one. + */ /* Set the key to search for the direct items of the file */ make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, 4); - /* Move bytes from the direct items to the new unformatted node - and delete them. */ + /* + * Move bytes from the direct items to the new unformatted node + * and delete them. + */ while (1) { int tail_size; - /* end_key.k_offset is set so, that we will always have found - last item of the file */ + /* + * end_key.k_offset is set so, that we will always have found + * last item of the file + */ if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) reiserfs_panic(sb, "PAP-14050", "direct item (%K) not found", &end_key); - p_le_ih = PATH_PITEM_HEAD(path); + p_le_ih = tp_item_head(path); RFALSE(!is_direct_le_ih(p_le_ih), "vs-14055: direct item expected(%K), found %h", &end_key, p_le_ih); tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) + ih_item_len(p_le_ih) - 1; - /* we only send the unbh pointer if the buffer is not up to date. - ** this avoids overwriting good data from writepage() with old data - ** from the disk or buffer cache - ** Special case: unbh->b_page will be NULL if we are coming through - ** DIRECT_IO handler here. + /* + * we only send the unbh pointer if the buffer is not + * up to date. this avoids overwriting good data from + * writepage() with old data from the disk or buffer cache + * Special case: unbh->b_page will be NULL if we are coming + * through DIRECT_IO handler here. */ if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { @@ -117,13 +139,15 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, up_to_date_bh); total_tail += retval; + + /* done: file does not have direct items anymore */ if (tail_size == retval) - // done: file does not have direct items anymore break; } - /* if we've copied bytes from disk into the page, we need to zero - ** out the unused part of the block (it was not up to date before) + /* + * if we've copied bytes from disk into the page, we need to zero + * out the unused part of the block (it was not up to date before) */ if (up_to_date_bh) { unsigned pgoff = @@ -146,9 +170,11 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) BUG(); } clear_buffer_dirty(bh); - /* Remove the buffer from whatever list it belongs to. We are mostly - interested in removing it from per-sb j_dirty_buffers list, to avoid - BUG() on attempt to write not mapped buffer */ + /* + * Remove the buffer from whatever list it belongs to. We are mostly + * interested in removing it from per-sb j_dirty_buffers list, to avoid + * BUG() on attempt to write not mapped buffer + */ if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { struct inode *inode = bh->b_page->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); @@ -164,12 +190,14 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) unlock_buffer(bh); } -/* this first locks inode (neither reads nor sync are permitted), - reads tail through page cache, insert direct item. When direct item - inserted successfully inode is left locked. Return value is always - what we expect from it (number of cut bytes). But when tail remains - in the unformatted node, we set mode to SKIP_BALANCING and unlock - inode */ +/* + * this first locks inode (neither reads nor sync are permitted), + * reads tail through page cache, insert direct item. When direct item + * inserted successfully inode is left locked. Return value is always + * what we expect from it (number of cut bytes). But when tail remains + * in the unformatted node, we set mode to SKIP_BALANCING and unlock + * inode + */ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *inode, struct page *page, struct treepath *path, /* path to the indirect item. */ @@ -194,7 +222,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, *mode = M_SKIP_BALANCING; /* store item head path points to. */ - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); tail_len = (n_new_file_size & (block_size - 1)); if (get_inode_sd_version(inode) == STAT_DATA_V2) @@ -207,9 +235,11 @@ int indirect2direct(struct reiserfs_transaction_handle *th, 1) * sb->s_blocksize; pos1 = pos; - // we are protected by i_mutex. The tail can not disapper, not - // append can be done either - // we are in truncate or packing tail in file_release + /* + * we are protected by i_mutex. The tail can not disapper, not + * append can be done either + * we are in truncate or packing tail in file_release + */ tail = (char *)kmap(page); /* this can schedule */ @@ -220,7 +250,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, reiserfs_panic(sb, "PAP-5520", "item to be converted %K does not exist", item_key); - copy_item_head(&s_ih, PATH_PITEM_HEAD(path)); + copy_item_head(&s_ih, tp_item_head(path)); #ifdef CONFIG_REISERFS_CHECK pos = le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - @@ -236,9 +266,10 @@ int indirect2direct(struct reiserfs_transaction_handle *th, pos1 + 1, TYPE_DIRECT, round_tail_len, 0xffff /*ih_free_space */ ); - /* we want a pointer to the first byte of the tail in the page. - ** the page was locked and this part of the page was up to date when - ** indirect2direct was called, so we know the bytes are still valid + /* + * we want a pointer to the first byte of the tail in the page. + * the page was locked and this part of the page was up to date when + * indirect2direct was called, so we know the bytes are still valid */ tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); @@ -250,12 +281,14 @@ int indirect2direct(struct reiserfs_transaction_handle *th, /* Insert tail as new direct item in the tree */ if (reiserfs_insert_item(th, path, &key, &s_ih, inode, tail ? tail : NULL) < 0) { - /* No disk memory. So we can not convert last unformatted node - to the direct item. In this case we used to adjust - indirect items's ih_free_space. Now ih_free_space is not - used, it would be ideal to write zeros to corresponding - unformatted node. For now i_size is considered as guard for - going out of file size */ + /* + * No disk memory. So we can not convert last unformatted node + * to the direct item. In this case we used to adjust + * indirect items's ih_free_space. Now ih_free_space is not + * used, it would be ideal to write zeros to corresponding + * unformatted node. For now i_size is considered as guard for + * going out of file size + */ kunmap(page); return block_size - round_tail_len; } @@ -264,12 +297,16 @@ int indirect2direct(struct reiserfs_transaction_handle *th, /* make sure to get the i_blocks changes from reiserfs_insert_item */ reiserfs_update_sd(th, inode); - // note: we have now the same as in above direct2indirect - // conversion: there are two keys which have matching first three - // key components. They only differ by the fouhth one. + /* + * note: we have now the same as in above direct2indirect + * conversion: there are two keys which have matching first three + * key components. They only differ by the fourth one. + */ - /* We have inserted new direct item and must remove last - unformatted node. */ + /* + * We have inserted new direct item and must remove last + * unformatted node. + */ *mode = M_CUT; /* we store position of first direct item in the in-core inode */ diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 5cdfbd638b5c..ca416d099e7d 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -56,9 +56,11 @@ #define XAROOT_NAME "xattrs" -/* Helpers for inode ops. We do this so that we don't have all the VFS +/* + * Helpers for inode ops. We do this so that we don't have all the VFS * overhead and also for proper i_mutex annotation. - * dir->i_mutex must be held for all of them. */ + * dir->i_mutex must be held for all of them. + */ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { @@ -73,10 +75,12 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return dir->i_op->mkdir(dir, dentry, mode); } -/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr +/* + * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr * mutation ops aren't called during rename or splace, which are the * only other users of I_MUTEX_CHILD. It violates the ordering, but that's - * better than allocating another subclass just for this code. */ + * better than allocating another subclass just for this code. + */ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; @@ -166,9 +170,11 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) return xadir; } -/* The following are side effects of other operations that aren't explicitly +/* + * The following are side effects of other operations that aren't explicitly * modifying extended attributes. This includes operations such as permissions - * or ownership changes, object deletions, etc. */ + * or ownership changes, object deletions, etc. + */ struct reiserfs_dentry_buf { struct dir_context ctx; struct dentry *xadir; @@ -267,11 +273,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, cleanup_dentry_buf(&buf); if (!err) { - /* We start a transaction here to avoid a ABBA situation + /* + * We start a transaction here to avoid a ABBA situation * between the xattr root's i_mutex and the journal lock. * This doesn't incur much additional overhead since the * new transaction will just nest inside the - * outer transaction. */ + * outer transaction. + */ int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; @@ -284,7 +292,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); - jerror = journal_end(&th, inode->i_sb, blocks); + jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); mutex_unlock(&dir->d_parent->d_inode->i_mutex); err = jerror ?: err; @@ -349,9 +357,11 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) } #ifdef CONFIG_REISERFS_FS_XATTR -/* Returns a dentry corresponding to a specific extended attribute file +/* + * Returns a dentry corresponding to a specific extended attribute file * for the inode. If flags allow, the file is created. Otherwise, a - * valid or negative dentry, or an error is returned. */ + * valid or negative dentry, or an error is returned. + */ static struct dentry *xattr_lookup(struct inode *inode, const char *name, int flags) { @@ -400,8 +410,10 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n) { struct address_space *mapping = dir->i_mapping; struct page *page; - /* We can deadlock if we try to free dentries, - and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ + /* + * We can deadlock if we try to free dentries, + * and an unlink/rmdir has just occurred - GFP_NOFS avoids this + */ mapping_set_gfp_mask(mapping, GFP_NOFS); page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); if (!IS_ERR(page)) { @@ -411,7 +423,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n) } return page; - fail: +fail: reiserfs_put_page(page); return ERR_PTR(-EIO); } @@ -589,7 +601,7 @@ int reiserfs_xattr_set(struct inode *inode, const char *name, buffer, buffer_size, flags); reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, jbegin_count); + error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); if (error == 0) error = error2; @@ -615,8 +627,10 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, if (name == NULL) return -EINVAL; - /* We can't have xattrs attached to v1 items since they don't have - * generation numbers */ + /* + * We can't have xattrs attached to v1 items since they don't have + * generation numbers + */ if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; @@ -913,12 +927,16 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = { static int xattr_mount_check(struct super_block *s) { - /* We need generation numbers to ensure that the oid mapping is correct - * v3.5 filesystems don't have them. */ + /* + * We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. + */ if (old_format_only(s)) { if (reiserfs_xattrs_optional(s)) { - /* Old format filesystem, but optional xattrs have - * been enabled. Error out. */ + /* + * Old format filesystem, but optional xattrs have + * been enabled. Error out. + */ reiserfs_warning(s, "jdm-2005", "xattrs/ACLs not supported " "on pre-v3.6 format filesystems. " @@ -972,9 +990,11 @@ int reiserfs_lookup_privroot(struct super_block *s) return err; } -/* We need to take a copy of the mount flags since things like +/* + * We need to take a copy of the mount flags since things like * MS_RDONLY don't get set until *after* we're called. - * mount_flags != mount_options */ + * mount_flags != mount_options + */ int reiserfs_xattr_init(struct super_block *s, int mount_flags) { int err = 0; @@ -1007,8 +1027,8 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) error: if (err) { - clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); - clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt); + clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); } /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index f59626c5d33b..857ec7e3016f 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -61,7 +61,8 @@ static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) return ret; } -/* We may have to create up to 3 objects: xattr root, xattr dir, xattr file. +/* + * We may have to create up to 3 objects: xattr root, xattr dir, xattr file. * Let's try to be smart about it. * xattr root: We cache it. If it's not cached, we may need to create it. * xattr dir: If anything has been loaded for this inode, we can set a flag diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index a6ce532402dc..44503e293790 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -25,8 +25,10 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; - /* Pessimism: We can't assume that anything from the xattr root up - * has been created. */ + /* + * Pessimism: We can't assume that anything from the xattr root up + * has been created. + */ jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) + reiserfs_xattr_nblocks(inode, size) * 2; @@ -37,7 +39,7 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (error == 0) { error = __reiserfs_set_acl(&th, inode, type, acl); reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, jcreate_blocks); + error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); if (error2) error = error2; @@ -111,7 +113,7 @@ static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t goto fail; return acl; - fail: +fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } @@ -164,7 +166,7 @@ static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * si } return (char *)ext_acl; - fail: +fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } @@ -208,8 +210,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) retval = reiserfs_xattr_get(inode, name, value, size); if (retval == -ENODATA || retval == -ENOSYS) { - /* This shouldn't actually happen as it should have - been caught above.. but just in case */ + /* + * This shouldn't actually happen as it should have + * been caught above.. but just in case + */ acl = NULL; } else if (retval < 0) { acl = ERR_PTR(retval); @@ -290,8 +294,10 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, return error; } -/* dir->i_mutex: locked, - * inode is new and not released into the wild yet */ +/* + * dir->i_mutex: locked, + * inode is new and not released into the wild yet + */ int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, @@ -304,14 +310,18 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, if (S_ISLNK(inode->i_mode)) return 0; - /* ACLs can only be used on "new" objects, so if it's an old object - * there is nothing to inherit from */ + /* + * ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from + */ if (get_inode_sd_version(dir) == STAT_DATA_V1) goto apply_umask; - /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This + /* + * Don't apply ACLs to objects in the .reiserfs_priv tree.. This * would be useless since permissions are ignored, and a pain because - * it introduces locking cycles */ + * it introduces locking cycles + */ if (IS_PRIVATE(dir)) { inode->i_flags |= S_PRIVATE; goto apply_umask; @@ -335,7 +345,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, return err; - apply_umask: +apply_umask: /* no ACL, apply umask */ inode->i_mode &= ~current_umask(); return err; diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index f373bde8f545..ea06c7554860 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -72,8 +72,8 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .mmap = romfs_mmap, .get_unmapped_area = romfs_get_unmapped_area, diff --git a/fs/seq_file.c b/fs/seq_file.c index 1d641bb108d2..3857b720cb1b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> +#include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> +#include <linux/mm.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; m->version = 0; @@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; diff --git a/fs/splice.c b/fs/splice.c index 9bc07d2b53cf..f5cb9ba84510 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,6 +32,7 @@ #include <linux/gfp.h> #include <linux/socket.h> #include <linux/compat.h> +#include <linux/aio.h> #include "internal.h" /* @@ -717,63 +718,6 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, sd->len, &pos, more); } -/* - * This is a little more tricky than the file -> pipe splicing. There are - * basically three cases: - * - * - Destination page already exists in the address space and there - * are users of it. For that case we have no other option that - * copying the data. Tough luck. - * - Destination page already exists in the address space, but there - * are no users of it. Make sure it's uptodate, then drop it. Fall - * through to last case. - * - Destination page does not exist, we can add the pipe page to - * the page cache and avoid the copy. - * - * If asked to move pages to the output file (SPLICE_F_MOVE is set in - * sd->flags), we attempt to migrate pages from the pipe to the output - * file address space page cache. This is possible if no one else has - * the pipe page referenced outside of the pipe and page cache. If - * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create - * a new page in the output file page cache and fill/dirty that. - */ -int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) -{ - struct file *file = sd->u.file; - struct address_space *mapping = file->f_mapping; - unsigned int offset, this_len; - struct page *page; - void *fsdata; - int ret; - - offset = sd->pos & ~PAGE_CACHE_MASK; - - this_len = sd->len; - if (this_len + offset > PAGE_CACHE_SIZE) - this_len = PAGE_CACHE_SIZE - offset; - - ret = pagecache_write_begin(file, mapping, sd->pos, this_len, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); - if (unlikely(ret)) - goto out; - - if (buf->page != page) { - char *src = kmap_atomic(buf->page); - char *dst = kmap_atomic(page); - - memcpy(dst + offset, src + buf->offset, this_len); - flush_dcache_page(page); - kunmap_atomic(dst); - kunmap_atomic(src); - } - ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, - page, fsdata); -out: - return ret; -} -EXPORT_SYMBOL(pipe_to_file); - static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); @@ -802,7 +746,7 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) * locking is required around copying the pipe buffers to the * destination. */ -int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, +static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; @@ -849,7 +793,6 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, return 1; } -EXPORT_SYMBOL(splice_from_pipe_feed); /** * splice_from_pipe_next - wait for some data to splice from @@ -861,7 +804,7 @@ EXPORT_SYMBOL(splice_from_pipe_feed); * value (one) if pipe buffers are available. It will return zero * or -errno if no more data needs to be spliced. */ -int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) +static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { while (!pipe->nrbufs) { if (!pipe->writers) @@ -886,7 +829,6 @@ int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) return 1; } -EXPORT_SYMBOL(splice_from_pipe_next); /** * splice_from_pipe_begin - start splicing from pipe @@ -897,12 +839,11 @@ EXPORT_SYMBOL(splice_from_pipe_next); * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ -void splice_from_pipe_begin(struct splice_desc *sd) +static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } -EXPORT_SYMBOL(splice_from_pipe_begin); /** * splice_from_pipe_end - finish splicing from pipe @@ -914,12 +855,11 @@ EXPORT_SYMBOL(splice_from_pipe_begin); * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ -void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) +static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } -EXPORT_SYMBOL(splice_from_pipe_end); /** * __splice_from_pipe - splice data from a pipe to given actor @@ -985,7 +925,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, } /** - * generic_file_splice_write - splice data from a pipe to a file + * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out @@ -995,40 +935,122 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. + * This one is ->write_iter-based. * */ ssize_t -generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, +iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; + int nbufs = pipe->buffers; + struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); ssize_t ret; + if (unlikely(!array)) + return -ENOMEM; + pipe_lock(pipe); splice_from_pipe_begin(&sd); - do { + while (sd.total_len) { + struct iov_iter from; + struct kiocb kiocb; + size_t left; + int n, idx; + ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - ret = file_remove_suid(out); - if (!ret) { - ret = file_update_time(out); - if (!ret) - ret = splice_from_pipe_feed(pipe, &sd, - pipe_to_file); + if (unlikely(nbufs < pipe->buffers)) { + kfree(array); + nbufs = pipe->buffers; + array = kcalloc(nbufs, sizeof(struct bio_vec), + GFP_KERNEL); + if (!array) { + ret = -ENOMEM; + break; + } } - mutex_unlock(&inode->i_mutex); - } while (ret > 0); + + /* build the vector */ + left = sd.total_len; + for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { + struct pipe_buffer *buf = pipe->bufs + idx; + size_t this_len = buf->len; + + if (this_len > left) + this_len = left; + + if (idx == pipe->buffers - 1) + idx = -1; + + ret = buf->ops->confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + goto done; + } + + array[n].bv_page = buf->page; + array[n].bv_len = this_len; + array[n].bv_offset = buf->offset; + left -= this_len; + } + + /* ... iov_iter */ + from.type = ITER_BVEC | WRITE; + from.bvec = array; + from.nr_segs = n; + from.count = sd.total_len - left; + from.iov_offset = 0; + + /* ... and iocb */ + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + kiocb.ki_nbytes = sd.total_len - left; + + /* now, send it */ + ret = out->f_op->write_iter(&kiocb, &from); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret <= 0) + break; + + sd.num_spliced += ret; + sd.total_len -= ret; + *ppos = sd.pos = kiocb.ki_pos; + + /* dismiss the fully eaten buffers, adjust the partial one */ + while (ret) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + if (ret >= buf->len) { + const struct pipe_buf_operations *ops = buf->ops; + ret -= buf->len; + buf->len = 0; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + if (pipe->files) + sd.need_wakeup = true; + } else { + buf->offset += ret; + buf->len -= ret; + ret = 0; + } + } + } +done: + kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); @@ -1036,21 +1058,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (sd.num_spliced) ret = sd.num_spliced; - if (ret > 0) { - int err; - - err = generic_write_sync(out, *ppos, ret); - if (err) - ret = err; - else - *ppos += ret; - balance_dirty_pages_ratelimited(mapping); - } - return ret; } -EXPORT_SYMBOL(generic_file_splice_write); +EXPORT_SYMBOL(iter_file_splice_write); static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) @@ -1537,7 +1548,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t count = 0; + ssize_t count; pipe = get_pipe_info(file); if (!pipe) @@ -1546,9 +1557,10 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, ret = rw_copy_check_uvector(READ, uiov, nr_segs, ARRAY_SIZE(iovstack), iovstack, &iov); if (ret <= 0) - return ret; + goto out; - iov_iter_init(&iter, iov, nr_segs, count, 0); + count = ret; + iov_iter_init(&iter, READ, iov, nr_segs, count); sd.len = 0; sd.total_len = count; @@ -1560,6 +1572,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); +out: if (iov != iovstack) kfree(iov); diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9e1bb79f7e6f..887d6d270080 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -25,7 +25,7 @@ #define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) -#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) +#define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) /* block.c */ extern int squashfs_read_data(struct super_block *, u64, int, u64 *, diff --git a/fs/super.c b/fs/super.c index e9dc3c3fe159..d20d5b11dedf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); - if (!grab_super_passive(sb)) - return 0; - + /* + * Don't call grab_super_passive as it is a potential + * scalability bottleneck. The counts could get updated + * between super_cache_count and super_cache_scan anyway. + * Call to super_cache_count with shrinker_rwsem held + * ensures the safety of call to list_lru_count_node() and + * s_op->nr_cached_objects(). + */ if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc->nid); @@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink, sc->nid); total_objects = vfs_pressure_ratio(total_objects); - drop_super(sb); return total_objects; } @@ -276,10 +280,8 @@ void deactivate_locked_super(struct super_block *s) struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { cleancache_invalidate_fs(s); - fs->kill_sb(s); - - /* caches are now gone, we can safely kill the shrinker now */ unregister_shrinker(&s->s_shrink); + fs->kill_sb(s); put_filesystem(fs); put_super(s); @@ -800,7 +802,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b8b91b67fdb..e9ef59b3abb1 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) ssize_t count; char *buf; - /* acquire buffer and ensure that it's >= PAGE_SIZE */ + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } + memset(buf, 0, PAGE_SIZE); /* * Invoke show(). Control may reach here via seq file lseek even @@ -453,95 +454,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index aa0406895b53..7d2a860ba788 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,7 +18,7 @@ #include "sysfs.h" -static void remove_files(struct kernfs_node *parent, struct kobject *kobj, +static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; @@ -29,7 +29,7 @@ static void remove_files(struct kernfs_node *parent, struct kobject *kobj, kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) - sysfs_remove_bin_file(kobj, *bin_attr); + kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static int create_files(struct kernfs_node *parent, struct kobject *kobj, @@ -62,7 +62,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, break; } if (error) { - remove_files(parent, kobj, grp); + remove_files(parent, grp); goto exit; } } @@ -79,7 +79,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, break; } if (error) - remove_files(parent, kobj, grp); + remove_files(parent, grp); } exit: return error; @@ -224,7 +224,7 @@ void sysfs_remove_group(struct kobject *kobj, kernfs_get(kn); } - remove_files(kn, kobj, grp); + remove_files(kn, grp); if (grp->name) kernfs_remove(kn); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index a66ad6196f59..8a49486bf30c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,6 +13,7 @@ #define DEBUG #include <linux/fs.h> +#include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/user_namespace.h> @@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; @@ -63,7 +65,8 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, 0, NULL); + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 9d4dc6831792..b00811c75b24 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -21,10 +21,10 @@ */ const struct file_operations sysv_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index e8e01d74dc05..eb997e9c4ab0 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -437,7 +437,6 @@ static int calc_dd_growth(const struct ubifs_info *c, */ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) { - int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); int err, idx_growth, data_growth, dd_growth, retried = 0; ubifs_assert(req->new_page <= 1); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 5157b866a853..177b0152fef4 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -745,8 +745,10 @@ void ubifs_dump_lprops(struct ubifs_info *c) for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { err = ubifs_read_one_lp(c, lnum, &lp); - if (err) + if (err) { ubifs_err("cannot read lprops for LEB %d", lnum); + continue; + } ubifs_dump_lprop(c, &lp); } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 4f34dbae823d..b5b593c45270 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -903,8 +903,9 @@ static int do_writepage(struct page *page, int len) struct ubifs_info *c = inode->i_sb->s_fs_info; #ifdef UBIFS_DEBUG + struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT); spin_unlock(&ui->ui_lock); #endif @@ -1363,17 +1364,17 @@ static inline int mctime_update_needed(const struct inode *inode, /** * update_ctime - update mtime and ctime of an inode. - * @c: UBIFS file-system description object * @inode: inode to update * * This function updates mtime and ctime of the inode if it is not equivalent to * current time. Returns zero in case of success and a negative error code in * case of failure. */ -static int update_mctime(struct ubifs_info *c, struct inode *inode) +static int update_mctime(struct inode *inode) { struct timespec now = ubifs_current_time(inode); struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; if (mctime_update_needed(inode, &now)) { int err, release; @@ -1396,18 +1397,13 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode) return 0; } -static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int err; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct ubifs_info *c = inode->i_sb->s_fs_info; - - err = update_mctime(c, inode); + int err = update_mctime(file_inode(iocb->ki_filp)); if (err) return err; - return generic_file_aio_write(iocb, iov, nr_segs, pos); + return generic_file_write_iter(iocb, from); } static int ubifs_set_page_dirty(struct page *page) @@ -1525,8 +1521,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, } wait_for_stable_page(page); - unlock_page(page); - return 0; + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); @@ -1582,15 +1577,15 @@ const struct inode_operations ubifs_symlink_inode_operations = { const struct file_operations ubifs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ubifs_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ubifs_write_iter, .mmap = ubifs_file_mmap, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index e18b9889a51b..2290d5866725 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -988,30 +988,32 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, return err; if (type != ch->node_type) { - ubifs_err("bad node type (%d but expected %d)", - ch->node_type, type); + ubifs_errc(c, "bad node type (%d but expected %d)", + ch->node_type, type); goto out; } err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { - ubifs_err("expected node type %d", type); + ubifs_errc(c, "expected node type %d", type); return err; } l = le32_to_cpu(ch->len); if (l != len) { - ubifs_err("bad node length %d, expected %d", l, len); + ubifs_errc(c, "bad node length %d, expected %d", l, len); goto out; } return 0; out: - ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs, - ubi_is_mapped(c->ubi, lnum)); - ubifs_dump_node(c, buf); - dump_stack(); + ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum, + offs, ubi_is_mapped(c->ubi, lnum)); + if (!c->probing) { + ubifs_dump_node(c, buf); + dump_stack(); + } return -EINVAL; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 4b826abb1528..45d4e96a6bac 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -460,9 +460,9 @@ static int write_cnodes(struct ubifs_info *c) * important. */ clear_bit(DIRTY_CNODE, &cnode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_CNODE, &cnode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); offs += len; dbg_chk_lpt_sz(c, 1, len); cnode = cnode->cnext; diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f35135e28e96..9a9fb94a41c6 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -128,7 +128,6 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) freed = ubifs_destroy_tnc_subtree(znode); atomic_long_sub(freed, &ubifs_clean_zn_cnt); atomic_long_sub(freed, &c->clean_zn_cnt); - ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); total_freed += freed; znode = zprev; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a1266089eca1..3904c8574ef9 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1149,6 +1149,9 @@ static int mount_ubifs(struct ubifs_info *c) size_t sz; c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY); + /* Suppress error messages while probing if MS_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + err = init_constants_early(c); if (err) return err; @@ -1214,6 +1217,8 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_free; + c->probing = 0; + /* * Make sure the compressor which is set as default in the superblock * or overridden by mount options is actually compiled in. @@ -1556,7 +1561,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - return err; + goto out; } err = check_free_space(c); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 9083bc7ed4ae..8a40cf9c02d7 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -2859,10 +2859,11 @@ void ubifs_tnc_close(struct ubifs_info *c) { tnc_destroy_cnext(c); if (c->zroot.znode) { - long n; + long n, freed; - ubifs_destroy_tnc_subtree(c->zroot.znode); n = atomic_long_read(&c->clean_zn_cnt); + freed = ubifs_destroy_tnc_subtree(c->zroot.znode); + ubifs_assert(freed == n); atomic_long_sub(n, &ubifs_clean_zn_cnt); } kfree(c->gap_lebs); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 52a6559275c4..3600994f8411 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -895,9 +895,9 @@ static int write_index(struct ubifs_info *c) * the reason for the second barrier. */ clear_bit(DIRTY_ZNODE, &znode->flags); - smp_mb__before_clear_bit(); + smp_mb__before_atomic(); clear_bit(COW_ZNODE, &znode->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); /* * We have marked the znode as clean but have not updated the diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e8c8cfe1435c..c1f71fe17cc0 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -51,6 +51,15 @@ #define ubifs_warn(fmt, ...) \ pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \ current->pid, __func__, ##__VA_ARGS__) +/* + * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description + * object as an argument. + */ +#define ubifs_errc(c, fmt, ...) \ + do { \ + if (!(c)->probing) \ + ubifs_err(fmt, ##__VA_ARGS__); \ + } while (0) /* UBIFS file system VFS magic number */ #define UBIFS_SUPER_MAGIC 0x24051905 @@ -1209,6 +1218,7 @@ struct ubifs_debug_info; * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay * @mounting: %1 while mounting + * @probing: %1 while attempting to mount if MS_SILENT mount flag is set * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay @@ -1441,6 +1451,7 @@ struct ubifs_info { unsigned int replaying:1; unsigned int mounting:1; unsigned int remounting_rw:1; + unsigned int probing:1; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; diff --git a/fs/udf/file.c b/fs/udf/file.c index d2c170f8b035..d80738fdf424 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -119,8 +119,8 @@ static int udf_adinicb_write_end(struct file *file, } static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { /* Fallback to buffered I/O. */ return 0; @@ -134,8 +134,7 @@ const struct address_space_operations udf_adinicb_aops = { .direct_IO = udf_adinicb_direct_IO, }; -static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t ppos) +static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t retval; struct file *file = iocb->ki_filp; @@ -150,7 +149,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (file->f_flags & O_APPEND) pos = inode->i_size; else - pos = ppos; + pos = iocb->ki_pos; if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + @@ -171,7 +170,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } else up_write(&iinfo->i_data_sem); - retval = __generic_file_aio_write(iocb, iov, nr_segs); + retval = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); if (retval > 0) { @@ -252,13 +251,13 @@ static int udf_release_file(struct inode *inode, struct file *filp) } const struct file_operations udf_file_operations = { - .read = do_sync_read, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .read_iter = generic_file_read_iter, .unlocked_ioctl = udf_ioctl, .open = generic_file_open, .mmap = generic_file_mmap, - .write = do_sync_write, - .aio_write = udf_file_aio_write, + .write = new_sync_write, + .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, .splice_read = generic_file_splice_read, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 5d643706212f..236cd48184c2 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -217,18 +217,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, } static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - udf_get_block); + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block); if (unlikely(ret < 0 && (rw & WRITE))) - udf_write_failed(mapping, offset + iov_length(iov, nr_segs)); + udf_write_failed(mapping, offset + count); return ret; } diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 0ab1de4b39a5..7bc20809c99e 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -24,7 +24,7 @@ #define INVBLOCK ((u64)-1L) -static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned, int *); +static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned); static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *); static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *); static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned); @@ -52,7 +52,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); cgno = ufs_dtog(uspi, fragment); bit = ufs_dtogd(uspi, fragment); @@ -116,12 +116,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); return; failed: - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return; } @@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) goto failed; } - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); do_more: overflow = 0; @@ -211,12 +211,12 @@ do_more: } ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); return; failed_unlock: - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); failed: UFSD("EXIT (FAILED)\n"); return; @@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, usb1 = ubh_get_usb_first(uspi); *err = -ENOSPC; - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); tmp = ufs_data_ptr_to_cpu(sb, p); if (count + ufs_fragnum(fragment) > uspi->s_fpb) { @@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, "fragment %llu, tmp %llu\n", (unsigned long long)fragment, (unsigned long long)tmp); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return 0; } } else { if (tmp) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return 0; } } @@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, * There is not enough space for user on the device */ if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return 0; } @@ -424,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); } - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -432,14 +432,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * resize block */ - result = ufs_add_fragments (inode, tmp, oldcount, newcount, err); + result = ufs_add_fragments(inode, tmp, oldcount, newcount); if (result) { *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -477,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -485,13 +485,13 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return result; } - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT (FAILED)\n"); return 0; } static u64 ufs_add_fragments(struct inode *inode, u64 fragment, - unsigned oldcount, unsigned newcount, int *err) + unsigned oldcount, unsigned newcount) { struct super_block * sb; struct ufs_sb_private_info * uspi; diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 33afa20d4509..c84ec010a676 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -35,10 +35,10 @@ const struct file_operations ufs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .fsync = generic_file_fsync, diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 98f7211599ff..a9cc75ffa925 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode) ino = inode->i_ino; - mutex_lock(&UFS_SB(sb)->s_lock); + lock_ufs(sb); if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return; } @@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode) bit = ufs_inotocgoff (ino); ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) { - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); return; } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); @@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - mutex_unlock(&UFS_SB(sb)->s_lock); + unlock_ufs(sb); UFSD("EXIT\n"); } @@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) sbi = UFS_SB(sb); uspi = sbi->s_uspi; - mutex_lock(&sbi->s_lock); + lock_ufs(sb); /* * Try to place the inode in its parent directory @@ -328,21 +328,20 @@ cg_found: sync_dirty_buffer(bh); brelse(bh); } - - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); clear_nlink(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: - mutex_unlock(&sbi->s_lock); + unlock_ufs(sb); make_bad_inode(inode); iput (inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index c1183f9f69dc..b879f1ba3439 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -697,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) unsigned flags; lock_ufs(sb); - mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -715,7 +714,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) ufs_put_cstotal(sb); UFSD("EXIT\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; @@ -760,6 +758,7 @@ static void ufs_put_super(struct super_block *sb) ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); + mutex_destroy(&sbi->mutex); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); @@ -786,6 +785,14 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags = 0; UFSD("ENTER\n"); + +#ifndef CONFIG_UFS_FS_WRITE + if (!(sb->s_flags & MS_RDONLY)) { + printk("ufs was compiled with read-only support, " + "can't be mounted as read-write\n"); + return -EROFS; + } +#endif sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); if (!sbi) @@ -795,15 +802,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); -#ifndef CONFIG_UFS_FS_WRITE - if (!(sb->s_flags & MS_RDONLY)) { - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); - goto failed; - } -#endif mutex_init(&sbi->mutex); - mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* @@ -1257,6 +1256,7 @@ magic_found: return 0; failed: + mutex_destroy(&sbi->mutex); if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); @@ -1280,7 +1280,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) sync_filesystem(sb); lock_ufs(sb); - mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1294,7 +1293,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } @@ -1302,14 +1300,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } @@ -1334,7 +1330,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; #else @@ -1344,13 +1339,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EPERM; } @@ -1358,7 +1351,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; - mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index ff2c15ab81aa..343e6fc571e5 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,7 +24,6 @@ struct ufs_sb_info { int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ - struct mutex s_lock; }; struct ufs_inode_info { diff --git a/fs/xattr.c b/fs/xattr.c index 3377dff18404..c69e6d43a0d2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -843,7 +843,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) /* wrap around? */ len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) + if (len < sizeof(*new_xattr)) return NULL; new_xattr = kmalloc(len, GFP_KERNEL); diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 0fdd4109c624..6e247a99f5db 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -160,30 +160,38 @@ typedef struct xfs_agi { * still being referenced. */ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; - + /* + * This marks the end of logging region 1 and start of logging region 2. + */ uuid_t agi_uuid; /* uuid of filesystem */ __be32 agi_crc; /* crc of agi sector */ __be32 agi_pad32; __be64 agi_lsn; /* last write sequence */ + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; #define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) -#define XFS_AGI_MAGICNUM 0x00000001 -#define XFS_AGI_VERSIONNUM 0x00000002 -#define XFS_AGI_SEQNO 0x00000004 -#define XFS_AGI_LENGTH 0x00000008 -#define XFS_AGI_COUNT 0x00000010 -#define XFS_AGI_ROOT 0x00000020 -#define XFS_AGI_LEVEL 0x00000040 -#define XFS_AGI_FREECOUNT 0x00000080 -#define XFS_AGI_NEWINO 0x00000100 -#define XFS_AGI_DIRINO 0x00000200 -#define XFS_AGI_UNLINKED 0x00000400 -#define XFS_AGI_NUM_BITS 11 -#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index c1cf6a336a72..d43813267a80 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -257,16 +257,14 @@ xfs_alloc_fix_len( k = rlen % args->prod; if (k == args->mod) return; - if (k > args->mod) { - if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen) - return; - } else { - if ((int)(rlen = rlen - args->prod - (args->mod - k)) < - (int)args->minlen) - return; - } - ASSERT(rlen >= args->minlen); - ASSERT(rlen <= args->maxlen); + if (k > args->mod) + rlen = rlen - (k - args->mod); + else + rlen = rlen - args->prod + (args->mod - k); + if ((int)rlen < (int)args->minlen) + return; + ASSERT(rlen >= args->minlen && rlen <= args->maxlen); + ASSERT(rlen % args->prod == args->mod); args->len = rlen; } @@ -541,7 +539,6 @@ xfs_alloc_read_agfl( XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; - ASSERT(!xfs_buf_geterror(bp)); xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index cc1eadcbb049..8358f1ded94d 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -70,7 +70,6 @@ xfs_allocbt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { int error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 75df77d09f75..faaf716e2080 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -975,14 +975,39 @@ xfs_vm_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) goto redirty; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - if (page->index >= end_index) { + + /* + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | <EOF> | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + if (page->index < end_index) + end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | <EOF> | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* @@ -990,24 +1015,36 @@ xfs_vm_writepage( * truncate operation that is in progress. We must redirty the * page so that reclaim stops reclaiming it. Otherwise * xfs_vm_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long, it would overflow + * if the given offset is greater than 16TB on 32-bit system + * and if we do check the page is fully outside i_size or not + * via "if (page->index >= end_index + 1)" as "end_index + 1" + * will be evaluated to 0. Hence this page will be redirtied + * and be written out repeatedly which would result in an + * infinite loop, the user program that perform this operation + * will hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if it's + * offset is just equal to the EOF. */ - if (page->index >= end_index + 1 || offset_into_page == 0) + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) goto redirty; /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining + * that is not a multiple of the page size, the remaining * memory is zeroed when mapped, and writes to that region are * not written out to the file." */ zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; } - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - offset); len = 1 << inode->i_blkbits; bh = head = page_buffers(page); @@ -1188,9 +1225,9 @@ xfs_vm_releasepage( xfs_count_page_state(page, &delalloc, &unwritten); - if (WARN_ON(delalloc)) + if (WARN_ON_ONCE(delalloc)) return 0; - if (WARN_ON(unwritten)) + if (WARN_ON_ONCE(unwritten)) return 0; return try_to_free_buffers(page); @@ -1344,6 +1381,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1354,6 +1399,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1435,9 +1486,8 @@ STATIC ssize_t xfs_vm_direct_IO( int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct block_device *bdev = xfs_find_bdev_for_inode(inode); @@ -1445,7 +1495,7 @@ xfs_vm_direct_IO( ssize_t ret; if (rw & WRITE) { - size_t size = iov_length(iov, nr_segs); + size_t size = iov_iter_count(iter); /* * We cannot preallocate a size update transaction here as we @@ -1457,17 +1507,15 @@ xfs_vm_direct_IO( if (offset + size > XFS_I(inode)->i_d.di_size) ioend->io_isdirect = 1; - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, DIO_ASYNC_EXTEND); if (ret != -EIOCBQUEUED && iocb->private) goto out_destroy_ioend; } else { - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, - offset, nr_segs, - xfs_get_blocks_direct, + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, NULL, NULL, 0); } @@ -1566,6 +1614,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1599,12 +1657,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1615,9 +1682,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1640,8 +1710,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 01b6a0102fbd..bfe36fc2cdc2 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -77,17 +77,27 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state); STATIC int -xfs_attr_name_to_xname( - struct xfs_name *xname, - const unsigned char *aname) +xfs_attr_args_init( + struct xfs_da_args *args, + struct xfs_inode *dp, + const unsigned char *name, + int flags) { - if (!aname) + + if (!name) return EINVAL; - xname->name = aname; - xname->len = strlen((char *)aname); - if (xname->len >= MAXNAMELEN) + + memset(args, 0, sizeof(*args)); + args->geo = dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->dp = dp; + args->flags = flags; + args->name = name; + args->namelen = strlen((const char *)name); + if (args->namelen >= MAXNAMELEN) return EFAULT; /* match IRIX behaviour */ + args->hashval = xfs_da_hashname(args->name, args->namelen); return 0; } @@ -106,79 +116,46 @@ xfs_inode_hasattr( * Overall external interface routines. *========================================================================*/ -STATIC int -xfs_attr_get_int( +int +xfs_attr_get( struct xfs_inode *ip, - struct xfs_name *name, + const unsigned char *name, unsigned char *value, int *valuelenp, int flags) { - xfs_da_args_t args; - int error; + struct xfs_da_args args; + uint lock_mode; + int error; + + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EIO; if (!xfs_inode_hasattr(ip)) return ENOATTR; - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; + error = xfs_attr_args_init(&args, ip, name, flags); + if (error) + return error; + args.value = value; args.valuelen = *valuelenp; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = ip; - args.whichfork = XFS_ATTR_FORK; - /* - * Decide on what work routines to call based on the inode size. - */ - if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + lock_mode = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_hasattr(ip)) + error = ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) error = xfs_attr_shortform_getvalue(&args); - } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) error = xfs_attr_leaf_get(&args); - } else { + else error = xfs_attr_node_get(&args); - } + xfs_iunlock(ip, lock_mode); - /* - * Return the number of bytes in the value to the caller. - */ *valuelenp = args.valuelen; - - if (error == EEXIST) - error = 0; - return(error); -} - -int -xfs_attr_get( - xfs_inode_t *ip, - const unsigned char *name, - unsigned char *value, - int *valuelenp, - int flags) -{ - int error; - struct xfs_name xname; - uint lock_mode; - - XFS_STATS_INC(xs_attr_get); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return(EIO); - - error = xfs_attr_name_to_xname(&xname, name); - if (error) - return error; - - lock_mode = xfs_ilock_attr_map_shared(ip); - error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); - xfs_iunlock(ip, lock_mode); - return(error); + return error == EEXIST ? 0 : error; } /* @@ -186,12 +163,10 @@ xfs_attr_get( */ STATIC int xfs_attr_calc_size( - struct xfs_inode *ip, - int namelen, - int valuelen, + struct xfs_da_args *args, int *local) { - struct xfs_mount *mp = ip->i_mount; + struct xfs_mount *mp = args->dp->i_mount; int size; int nblks; @@ -199,12 +174,10 @@ xfs_attr_calc_size( * Determine space new attribute will use, and if it would be * "local" or "remote" (note: local != inline). */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, - mp->m_sb.sb_blocksize, local); - + size = xfs_attr_leaf_newentsize(args, local); nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); if (*local) { - if (size > (mp->m_sb.sb_blocksize >> 1)) { + if (size > (args->geo->blksize / 2)) { /* Double split possible */ nblks *= 2; } @@ -213,7 +186,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -221,26 +194,38 @@ xfs_attr_calc_size( return nblks; } -STATIC int -xfs_attr_set_int( - struct xfs_inode *dp, - struct xfs_name *name, - unsigned char *value, - int valuelen, - int flags) +int +xfs_attr_set( + struct xfs_inode *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) { - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error, err2, committed; struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; struct xfs_trans_res tres; + xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; - int local; + int error, err2, committed, local; + + XFS_STATS_INC(xs_attr_set); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; + + args.value = value; + args.valuelen = valuelen; + args.firstblock = &firstblock; + args.flist = &flist; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + args.total = xfs_attr_calc_size(&args, &local); - /* - * Attach the dquots to the inode. - */ error = xfs_qm_dqattach(dp, 0); if (error) return error; @@ -251,32 +236,14 @@ xfs_attr_set_int( */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); + XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); - if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) - return(error); + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; } /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; - args.value = value; - args.valuelen = valuelen; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; - args.firstblock = &firstblock; - args.flist = &flist; - args.whichfork = XFS_ATTR_FORK; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - - /* Size is now blocks for attribute data */ - args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); - - /* * Start our first transaction of the day. * * All future transactions during this code must be "chained" off @@ -303,7 +270,7 @@ xfs_attr_set_int( error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -313,7 +280,7 @@ xfs_attr_set_int( if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); - return (error); + return error; } xfs_trans_ijoin(args.trans, dp, 0); @@ -322,9 +289,9 @@ xfs_attr_set_int( * If the attribute list is non-existent or a shortform list, * upgrade it to a single-leaf-block attribute list. */ - if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && - (dp->i_d.di_anextents == 0))) { + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { /* * Build initial attribute list (if required). @@ -349,9 +316,8 @@ xfs_attr_set_int( * the transaction goes to disk before returning * to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if (!error && (flags & ATTR_KERNOTIME) == 0) { xfs_trans_ichgtime(args.trans, dp, @@ -361,7 +327,7 @@ xfs_attr_set_int( XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error == 0 ? err2 : error); + return error ? error : err2; } /* @@ -399,22 +365,19 @@ xfs_attr_set_int( } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) error = xfs_attr_leaf_addname(&args); - } else { + else error = xfs_attr_node_addname(&args); - } - if (error) { + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if ((flags & ATTR_KERNOTIME) == 0) xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); @@ -426,65 +389,47 @@ xfs_attr_set_int( error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; } +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ int -xfs_attr_set( - xfs_inode_t *dp, - const unsigned char *name, - unsigned char *value, - int valuelen, - int flags) +xfs_attr_remove( + struct xfs_inode *dp, + const unsigned char *name, + int flags) { - int error; - struct xfs_name xname; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + xfs_fsblock_t firstblock; + int error; - XFS_STATS_INC(xs_attr_set); + XFS_STATS_INC(xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); + return EIO; - error = xfs_attr_name_to_xname(&xname, name); + if (!xfs_inode_hasattr(dp)) + return ENOATTR; + + error = xfs_attr_args_init(&args, dp, name, flags); if (error) return error; - return xfs_attr_set_int(dp, &xname, value, valuelen, flags); -} - -/* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. - */ -STATIC int -xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) -{ - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error; - xfs_mount_t *mp = dp->i_mount; - - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name->name; - args.namelen = name->len; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; args.firstblock = &firstblock; args.flist = &flist; - args.total = 0; - args.whichfork = XFS_ATTR_FORK; /* * we have no control over the attribute names that userspace passes us @@ -493,9 +438,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) */ args.op_flags = XFS_DA_OP_OKNOENT; - /* - * Attach the dquots to the inode. - */ error = xfs_qm_dqattach(dp, 0); if (error) return error; @@ -524,7 +466,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) XFS_ATTRRM_SPACE_RES(mp), 0); if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -534,35 +476,26 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) */ xfs_trans_ijoin(args.trans, dp, 0); - /* - * Decide on what work routines to call based on the inode size. - */ if (!xfs_inode_hasattr(dp)) { error = XFS_ERROR(ENOATTR); - goto out; - } - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); error = xfs_attr_shortform_remove(&args); - if (error) { - goto out; - } } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { error = xfs_attr_leaf_removename(&args); } else { error = xfs_attr_node_removename(&args); } - if (error) { + + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } if ((flags & ATTR_KERNOTIME) == 0) xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); @@ -574,45 +507,17 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); -} - -int -xfs_attr_remove( - xfs_inode_t *dp, - const unsigned char *name, - int flags) -{ - int error; - struct xfs_name xname; - - XFS_STATS_INC(xs_attr_remove); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); - - error = xfs_attr_name_to_xname(&xname, name); - if (error) - return error; - - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp)) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOATTR); } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - return xfs_attr_remove_int(dp, &xname, flags); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; } - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ @@ -698,11 +603,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* @@ -794,6 +710,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -946,7 +863,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args) } /*======================================================================== - * External routines when attribute list size > XFS_LBSIZE(mp). + * External routines when attribute list size > geo->blksize *========================================================================*/ /* @@ -979,8 +896,6 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name already exists, and get back a pointer @@ -999,13 +914,22 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } retval = xfs_attr3_leaf_add(blk->bp, state->args); @@ -1133,6 +1057,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -1148,8 +1073,6 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; state->inleaf = 0; error = xfs_da3_node_lookup_int(state, &retval); if (error) @@ -1240,8 +1163,6 @@ xfs_attr_node_removename(xfs_da_args_t *args) state = xfs_da_state_alloc(); state->args = args; state->mp = dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. @@ -1503,8 +1424,6 @@ xfs_attr_node_get(xfs_da_args_t *args) state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index fe9587fab17a..28712d29e43c 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -80,11 +80,12 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, /* * Utility routines. */ -STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, +STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, + struct xfs_attr_leafblock *src_leaf, struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, struct xfs_attr_leafblock *dst_leaf, struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, - int move_count, struct xfs_mount *mp); + int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); void @@ -711,6 +712,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; + nargs.geo = args->geo; nargs.firstblock = args->firstblock; nargs.flist = args->flist; nargs.total = args->total; @@ -805,18 +807,18 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); if (!tmpbuffer) return ENOMEM; - memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount)); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? */ - memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount)); + memset(bp->b_addr, 0, args->geo->blksize); /* * Clean out the prior contents of the attribute list. @@ -838,6 +840,7 @@ xfs_attr3_leaf_to_shortform( * Copy the attributes */ memset((char *)&nargs, 0, sizeof(nargs)); + nargs.geo = args->geo; nargs.dp = dp; nargs.firstblock = args->firstblock; nargs.flist = args->flist; @@ -904,12 +907,12 @@ xfs_attr3_leaf_to_node( /* copy leaf to new buffer, update identifiers */ xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp)); + memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; hdr3->blkno = cpu_to_be64(bp2->b_bn); } - xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* * Set up the new root node. @@ -930,7 +933,7 @@ xfs_attr3_leaf_to_node( btree[0].before = cpu_to_be32(blkno); icnodehdr.count = 1; dp->d_ops->node_hdr_to_disk(node, &icnodehdr); - xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); error = 0; out: return error; @@ -966,10 +969,10 @@ xfs_attr3_leaf_create( bp->b_ops = &xfs_attr3_leaf_buf_ops; xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; - memset(leaf, 0, XFS_LBSIZE(mp)); + memset(leaf, 0, args->geo->blksize); memset(&ichdr, 0, sizeof(ichdr)); - ichdr.firstused = XFS_LBSIZE(mp); + ichdr.firstused = args->geo->blksize; if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_blkinfo *hdr3 = bp->b_addr; @@ -988,7 +991,7 @@ xfs_attr3_leaf_create( ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); - xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; return 0; @@ -1074,8 +1077,7 @@ xfs_attr3_leaf_add( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); - entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - args->trans->t_mountp->m_sb.sb_blocksize, NULL); + entsize = xfs_attr_leaf_newentsize(args, NULL); /* * Search through freemap for first-fit on new name length. @@ -1174,17 +1176,14 @@ xfs_attr3_leaf_add_work( * Allocate space for the new string (at the end of the run). */ mp = args->trans->t_mountp; - ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp)); + ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize); ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); ASSERT(ichdr->freemap[mapindex].size >= - xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, NULL)); - ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp)); + xfs_attr_leaf_newentsize(args, NULL)); + ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize); ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); - ichdr->freemap[mapindex].size -= - xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, &tmp); + ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp); entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); @@ -1229,6 +1228,7 @@ xfs_attr3_leaf_add_work( name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -1268,14 +1268,13 @@ xfs_attr3_leaf_compact( struct xfs_attr_leafblock *leaf_dst; struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; - struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); - memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); + memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; leaf_dst = bp->b_addr; @@ -1288,7 +1287,7 @@ xfs_attr3_leaf_compact( /* Initialise the incore headers */ ichdr_src = *ichdr_dst; /* struct copy */ - ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->firstused = args->geo->blksize; ichdr_dst->usedbytes = 0; ichdr_dst->count = 0; ichdr_dst->holes = 0; @@ -1303,13 +1302,13 @@ xfs_attr3_leaf_compact( * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, - ichdr_src.count, mp); + xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0, + leaf_dst, ichdr_dst, 0, ichdr_src.count); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. */ - xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); kmem_free(tmpbuffer); } @@ -1460,8 +1459,8 @@ xfs_attr3_leaf_rebalance( /* * Move high entries from leaf1 to low end of leaf2. */ - xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count, - leaf2, &ichdr2, 0, count, state->mp); + xfs_attr3_leaf_moveents(args, leaf1, &ichdr1, + ichdr1.count - count, leaf2, &ichdr2, 0, count); } else if (count > ichdr1.count) { /* @@ -1489,14 +1488,14 @@ xfs_attr3_leaf_rebalance( /* * Move low entries from leaf2 to high end of leaf1. */ - xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1, - ichdr1.count, count, state->mp); + xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count); } xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); - xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. @@ -1591,11 +1590,9 @@ xfs_attr3_leaf_figure_balance( max = ichdr1->count + ichdr2->count; half = (max + 1) * sizeof(*entry); half += ichdr1->usedbytes + ichdr2->usedbytes + - xfs_attr_leaf_newentsize(state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); half /= 2; - lastdelta = state->blocksize; + lastdelta = state->args->geo->blksize; entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { @@ -1605,10 +1602,7 @@ xfs_attr3_leaf_figure_balance( */ if (count == blk1->index) { tmp = totallen + sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); if (XFS_ATTR_ABS(half - tmp) > lastdelta) break; lastdelta = XFS_ATTR_ABS(half - tmp); @@ -1644,10 +1638,7 @@ xfs_attr3_leaf_figure_balance( totallen -= count * sizeof(*entry); if (foundit) { totallen -= sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); } *countarg = count; @@ -1699,7 +1690,7 @@ xfs_attr3_leaf_toosmall( bytes = xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + ichdr.usedbytes; - if (bytes > (state->blocksize >> 1)) { + if (bytes > (state->args->geo->blksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); } @@ -1753,7 +1744,8 @@ xfs_attr3_leaf_toosmall( xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); - bytes = state->blocksize - (state->blocksize >> 2) - + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2) - ichdr.usedbytes - ichdr2.usedbytes - ((ichdr.count + ichdr2.count) * sizeof(xfs_attr_leaf_entry_t)) - @@ -1804,7 +1796,6 @@ xfs_attr3_leaf_remove( struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entry; - struct xfs_mount *mp = args->trans->t_mountp; int before; int after; int smallest; @@ -1818,7 +1809,7 @@ xfs_attr3_leaf_remove( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); - ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); ASSERT(args->index >= 0 && args->index < ichdr.count); ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + xfs_attr3_leaf_hdr_size(leaf)); @@ -1826,7 +1817,7 @@ xfs_attr3_leaf_remove( entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); /* * Scan through free region table: @@ -1841,8 +1832,8 @@ xfs_attr3_leaf_remove( smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { - ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp)); - ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp)); + ASSERT(ichdr.freemap[i].base < args->geo->blksize); + ASSERT(ichdr.freemap[i].size < args->geo->blksize); if (ichdr.freemap[i].base == tablesize) { ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); @@ -1919,11 +1910,11 @@ xfs_attr3_leaf_remove( * removing the name. */ if (smallest) { - tmp = XFS_LBSIZE(mp); + tmp = args->geo->blksize; entry = xfs_attr3_leaf_entryp(leaf); for (i = ichdr.count - 1; i >= 0; entry++, i--) { ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); @@ -1946,7 +1937,7 @@ xfs_attr3_leaf_remove( tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t); - return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */ + return tmp < args->geo->magicpct; /* leaf is < 37% full */ } /* @@ -1963,7 +1954,6 @@ xfs_attr3_leaf_unbalance( struct xfs_attr3_icleaf_hdr drophdr; struct xfs_attr3_icleaf_hdr savehdr; struct xfs_attr_leaf_entry *entry; - struct xfs_mount *mp = state->mp; trace_xfs_attr_leaf_unbalance(state->args); @@ -1990,13 +1980,15 @@ xfs_attr3_leaf_unbalance( */ if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, save_leaf, &savehdr, 0, - drophdr.count, mp); + drophdr.count); } else { - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, save_leaf, &savehdr, - savehdr.count, drophdr.count, mp); + savehdr.count, drophdr.count); } } else { /* @@ -2006,7 +1998,7 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); /* * Copy the header into the temp leaf so that all the stuff @@ -2019,35 +2011,39 @@ xfs_attr3_leaf_unbalance( tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; - tmphdr.firstused = state->blocksize; + tmphdr.firstused = state->args->geo->blksize; /* write the header to the temp buffer to initialise it */ xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, tmp_leaf, &tmphdr, 0, - drophdr.count, mp); - xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + drophdr.count); + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, tmp_leaf, &tmphdr, tmphdr.count, - savehdr.count, mp); + savehdr.count); } else { - xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0, + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, tmp_leaf, &tmphdr, 0, - savehdr.count, mp); - xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, + savehdr.count); + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, tmp_leaf, &tmphdr, tmphdr.count, - drophdr.count, mp); + drophdr.count); } - memcpy(save_leaf, tmp_leaf, state->blocksize); + memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ kmem_free(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, - state->blocksize - 1); + state->args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. @@ -2093,7 +2089,7 @@ xfs_attr3_leaf_lookup_int( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); + ASSERT(ichdr.count < args->geo->blksize / 8); /* * Binary search. (note: small blocks will skip this loop) @@ -2167,11 +2163,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - args->valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, - args->valuelen); + args->rmtvaluelen); return XFS_ERROR(EEXIST); } } @@ -2197,7 +2193,7 @@ xfs_attr3_leaf_getvalue( leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); - ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8); + ASSERT(ichdr.count < args->geo->blksize / 8); ASSERT(args->index < ichdr.count); entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; @@ -2220,19 +2216,19 @@ xfs_attr3_leaf_getvalue( name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - valuelen); + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } return 0; } @@ -2248,14 +2244,14 @@ xfs_attr3_leaf_getvalue( /*ARGSUSED*/ STATIC void xfs_attr3_leaf_moveents( + struct xfs_da_args *args, struct xfs_attr_leafblock *leaf_s, struct xfs_attr3_icleaf_hdr *ichdr_s, int start_s, struct xfs_attr_leafblock *leaf_d, struct xfs_attr3_icleaf_hdr *ichdr_d, int start_d, - int count, - struct xfs_mount *mp) + int count) { struct xfs_attr_leaf_entry *entry_s; struct xfs_attr_leaf_entry *entry_d; @@ -2275,10 +2271,10 @@ xfs_attr3_leaf_moveents( ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); ASSERT(ichdr_s->magic == ichdr_d->magic); - ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8); ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + xfs_attr3_leaf_hdr_size(leaf_s)); - ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8); + ASSERT(ichdr_d->count < args->geo->blksize / 8); ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + xfs_attr3_leaf_hdr_size(leaf_d)); @@ -2330,11 +2326,11 @@ xfs_attr3_leaf_moveents( entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp - <= XFS_LBSIZE(mp)); + <= args->geo->blksize); memmove(xfs_attr3_leaf_name(leaf_d, desti), xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp - <= XFS_LBSIZE(mp)); + <= args->geo->blksize); memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); ichdr_s->usedbytes -= tmp; ichdr_d->usedbytes += tmp; @@ -2355,7 +2351,7 @@ xfs_attr3_leaf_moveents( tmp = count * sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); + ((char *)leaf_s + args->geo->blksize)); memset(entry_s, 0, tmp); } else { /* @@ -2370,7 +2366,7 @@ xfs_attr3_leaf_moveents( tmp = count * sizeof(xfs_attr_leaf_entry_t); entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); + ((char *)leaf_s + args->geo->blksize)); memset(entry_s, 0, tmp); } @@ -2438,22 +2434,21 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) * a "local" or a "remote" attribute. */ int -xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) +xfs_attr_leaf_newentsize( + struct xfs_da_args *args, + int *local) { - int size; + int size; - size = xfs_attr_leaf_entsize_local(namelen, valuelen); - if (size < xfs_attr_leaf_entsize_local_max(blocksize)) { - if (local) { + size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen); + if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) { + if (local) *local = 1; - } - } else { - size = xfs_attr_leaf_entsize_remote(namelen); - if (local) { - *local = 0; - } + return size; } - return size; + if (local) + *local = 0; + return xfs_attr_leaf_entsize_remote(args->namelen); } @@ -2519,7 +2514,7 @@ xfs_attr3_leaf_clearflag( ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2677,7 +2672,7 @@ xfs_attr3_leaf_flipflags( ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 3ec5ec0b8678..e2929da7c3ba 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -96,8 +96,7 @@ int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp); -int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, - int *local); +int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 01db96f60cf0..90e2eeb21207 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -444,9 +444,11 @@ xfs_attr3_leaf_list_int( xfs_da_args_t args; memset((char *)&args, 0, sizeof(args)); + args.geo = context->dp->i_mount->m_attr_geo; args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; + args.rmtvaluelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = xfs_attr3_rmt_blocks( diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 6e37823e2932..b5adfecbb8ee 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -68,7 +68,6 @@ xfs_attr3_rmt_blocks( */ static bool xfs_attr3_rmt_hdr_ok( - struct xfs_mount *mp, void *ptr, xfs_ino_t ino, uint32_t offset, @@ -126,6 +125,7 @@ xfs_attr3_rmt_read_verify( char *ptr; int len; xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -134,21 +134,20 @@ xfs_attr3_rmt_read_verify( ptr = bp->b_addr; bno = bp->b_bn; len = BBTOB(bp->b_length); - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0) { - if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), - XFS_ATTR3_RMT_CRC_OFF)) { + if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { xfs_buf_ioerror(bp, EFSBADCRC); break; } - if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { xfs_buf_ioerror(bp, EFSCORRUPTED); break; } - len -= XFS_LBSIZE(mp); - ptr += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); } if (bp->b_error) @@ -166,6 +165,7 @@ xfs_attr3_rmt_write_verify( char *ptr; int len; xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -174,10 +174,10 @@ xfs_attr3_rmt_write_verify( ptr = bp->b_addr; bno = bp->b_bn; len = BBTOB(bp->b_length); - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0) { - if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { xfs_buf_ioerror(bp, EFSCORRUPTED); xfs_verifier_error(bp); return; @@ -188,11 +188,11 @@ xfs_attr3_rmt_write_verify( rmt = (struct xfs_attr3_rmt_hdr *)ptr; rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); } - xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); + xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); - len -= XFS_LBSIZE(mp); - ptr += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); } ASSERT(len == 0); } @@ -241,17 +241,18 @@ xfs_attr_rmtval_copyout( char *src = bp->b_addr; xfs_daddr_t bno = bp->b_bn; int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { int hdr_size = 0; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); byte_cnt = min(*valuelen, byte_cnt); if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, byte_cnt, bno)) { xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", @@ -264,9 +265,9 @@ xfs_attr_rmtval_copyout( memcpy(*dst, src + hdr_size, byte_cnt); /* roll buffer forwards */ - len -= XFS_LBSIZE(mp); - src += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + src += blksize; + bno += BTOBB(blksize); /* roll attribute data forwards */ *valuelen -= byte_cnt; @@ -288,12 +289,13 @@ xfs_attr_rmtval_copyin( char *dst = bp->b_addr; xfs_daddr_t bno = bp->b_bn; int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; - ASSERT(len >= XFS_LBSIZE(mp)); + ASSERT(len >= blksize); while (len > 0 && *valuelen > 0) { int hdr_size; - int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); byte_cnt = min(*valuelen, byte_cnt); hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, @@ -305,17 +307,17 @@ xfs_attr_rmtval_copyin( * If this is the last block, zero the remainder of it. * Check that we are actually the last block, too. */ - if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + if (byte_cnt + hdr_size < blksize) { ASSERT(*valuelen - byte_cnt == 0); - ASSERT(len == XFS_LBSIZE(mp)); + ASSERT(len == blksize); memset(dst + hdr_size + byte_cnt, 0, - XFS_LBSIZE(mp) - hdr_size - byte_cnt); + blksize - hdr_size - byte_cnt); } /* roll buffer forwards */ - len -= XFS_LBSIZE(mp); - dst += XFS_LBSIZE(mp); - bno += mp->m_bsize; + len -= blksize; + dst += blksize; + bno += BTOBB(blksize); /* roll attribute data forwards */ *valuelen -= byte_cnt; @@ -337,7 +339,7 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; __uint8_t *dst = args->value; - int valuelen = args->valuelen; + int valuelen; int nmap; int error; int blkcnt = args->rmtblkcnt; @@ -347,7 +349,9 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + valuelen = args->rmtvaluelen; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, @@ -415,7 +419,7 @@ xfs_attr_rmtval_set( * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) @@ -480,7 +484,7 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - valuelen = args->valuelen; + valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index f1e3c907044d..e1649c0d3e02 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -66,8 +66,11 @@ static inline int xfs_lowbit64(__uint64_t v) n = ffs(w); } else { /* upper bits */ w = (__uint32_t)(v >> 32); - if (w && (n = ffs(w))) - n += 32; + if (w) { + n = ffs(w); + if (n) + n += 32; + } } return n - 1; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5b6092ef51ef..75c3fe5f3d9d 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -94,7 +94,7 @@ xfs_bmap_compute_maxlevels( maxleafents = MAXAEXTNUM; sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); } - maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0); + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); minleafrecs = mp->m_bmap_dmnr[0]; minnoderecs = mp->m_bmap_dmnr[1]; maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; @@ -233,7 +233,6 @@ xfs_default_attroffset( */ STATIC void xfs_bmap_forkoff_reset( - xfs_mount_t *mp, xfs_inode_t *ip, int whichfork) { @@ -905,7 +904,7 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_bytes == 0); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); - xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); + xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_flags &= ~XFS_IFINLINE; ifp->if_flags |= XFS_IFEXTENTS; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); @@ -1099,10 +1098,11 @@ xfs_bmap_add_attrfork_local( if (S_ISDIR(ip->i_d.di_mode)) { memset(&dargs, 0, sizeof(dargs)); + dargs.geo = ip->i_mount->m_dir_geo; dargs.dp = ip; dargs.firstblock = firstblock; dargs.flist = flist; - dargs.total = ip->i_mount->m_dirblkfsbs; + dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; return xfs_dir2_sf_to_block(&dargs); @@ -1675,7 +1675,6 @@ xfs_bmap_isaeof( */ int xfs_bmap_last_offset( - struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork) @@ -3517,6 +3516,67 @@ xfs_bmap_adjacent( #undef ISVALID } +static int +xfs_bmap_longest_free_extent( + struct xfs_trans *tp, + xfs_agnumber_t ag, + xfs_extlen_t *blen, + int *notinit) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_extlen_t longest; + int error = 0; + + pag = xfs_perag_get(mp, ag); + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + if (error) + goto out; + + if (!pag->pagf_init) { + *notinit = 1; + goto out; + } + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + +out: + xfs_perag_put(pag); + return error; +} + +static void +xfs_bmap_select_minlen( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen, + int notinit) +{ + if (notinit || *blen < ap->minlen) { + /* + * Since we did a BUF_TRYLOCK above, it is possible that + * there is space for this request. + */ + args->minlen = ap->minlen; + } else if (*blen < args->maxlen) { + /* + * If the best seen length is less than the request length, + * use the best as the minimum. + */ + args->minlen = *blen; + } else { + /* + * Otherwise we've seen an extent as big as maxlen, use that + * as the minimum. + */ + args->minlen = args->maxlen; + } +} + STATIC int xfs_bmap_btalloc_nullfb( struct xfs_bmalloca *ap, @@ -3524,111 +3584,74 @@ xfs_bmap_btalloc_nullfb( xfs_extlen_t *blen) { struct xfs_mount *mp = ap->ip->i_mount; - struct xfs_perag *pag; xfs_agnumber_t ag, startag; int notinit = 0; int error; - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) - args->type = XFS_ALLOCTYPE_NEAR_BNO; - else - args->type = XFS_ALLOCTYPE_START_BNO; + args->type = XFS_ALLOCTYPE_START_BNO; args->total = ap->total; - /* - * Search for an allocation group with a single extent large enough - * for the request. If one isn't found, then adjust the minimum - * allocation size to the largest space found. - */ startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); if (startag == NULLAGNUMBER) startag = ag = 0; - pag = xfs_perag_get(mp, ag); while (*blen < args->maxlen) { - if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, args->tp, ag, - XFS_ALLOC_FLAG_TRYLOCK); - if (error) { - xfs_perag_put(pag); - return error; - } - } - - /* - * See xfs_alloc_fix_freelist... - */ - if (pag->pagf_init) { - xfs_extlen_t longest; - longest = xfs_alloc_longest_free_extent(mp, pag); - if (*blen < longest) - *blen = longest; - } else - notinit = 1; - - if (xfs_inode_is_filestream(ap->ip)) { - if (*blen >= args->maxlen) - break; - - if (ap->userdata) { - /* - * If startag is an invalid AG, we've - * come here once before and - * xfs_filestream_new_ag picked the - * best currently available. - * - * Don't continue looping, since we - * could loop forever. - */ - if (startag == NULLAGNUMBER) - break; - - error = xfs_filestream_new_ag(ap, &ag); - xfs_perag_put(pag); - if (error) - return error; + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; - /* loop again to set 'blen'*/ - startag = NULLAGNUMBER; - pag = xfs_perag_get(mp, ag); - continue; - } - } if (++ag == mp->m_sb.sb_agcount) ag = 0; if (ag == startag) break; - xfs_perag_put(pag); - pag = xfs_perag_get(mp, ag); } - xfs_perag_put(pag); - /* - * Since the above loop did a BUF_TRYLOCK, it is - * possible that there is space for this request. - */ - if (notinit || *blen < ap->minlen) - args->minlen = ap->minlen; - /* - * If the best seen length is less than the request - * length, use the best as the minimum. - */ - else if (*blen < args->maxlen) - args->minlen = *blen; - /* - * Otherwise we've seen an extent as big as maxlen, - * use that as the minimum. - */ - else - args->minlen = args->maxlen; + xfs_bmap_select_minlen(ap, args, blen, notinit); + return 0; +} + +STATIC int +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_NEAR_BNO; + args->total = ap->total; + + ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (ag == NULLAGNUMBER) + ag = 0; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); + if (error) + return error; + + if (*blen < args->maxlen) { + error = xfs_filestream_new_ag(ap, &ag); + if (error) + return error; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); /* - * set the failure fallback case to look in the selected - * AG as the stream may have moved. + * Set the failure fallback case to look in the selected AG as stream + * may have moved. */ - if (xfs_inode_is_filestream(ap->ip)) - ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); - + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } @@ -3708,7 +3731,15 @@ xfs_bmap_btalloc( args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { - error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + /* + * Search for an allocation group with a single extent large + * enough for the request. If one isn't found, then adjust + * the minimum allocation size to the largest space found. + */ + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); + else + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); if (error) return error; } else if (ap->flist->xbf_low) { @@ -4267,8 +4298,8 @@ xfs_bmapi_delay( } -int -__xfs_bmapi_allocate( +static int +xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; @@ -4547,9 +4578,6 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; - if (flags & XFS_BMAPI_STACK_SWITCH) - bma.stack_switch = 1; - while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); @@ -5413,6 +5441,7 @@ xfs_bmap_shift_extents( int whichfork = XFS_DATA_FORK; int logflags; xfs_filblks_t blockcount = 0; + int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5429,7 +5458,6 @@ xfs_bmap_shift_extents( ASSERT(current_ext != NULL); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -5456,7 +5484,6 @@ xfs_bmap_shift_extents( /* We are going to change core inode */ logflags = XFS_ILOG_CORE; - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5467,8 +5494,14 @@ xfs_bmap_shift_extents( logflags |= XFS_ILOG_DEXT; } - while (nexts++ < num_exts && - *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) { + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { gotp = xfs_iext_get_ext(ifp, *current_ext); xfs_bmbt_get_all(gotp, &got); @@ -5556,10 +5589,11 @@ xfs_bmap_shift_extents( } (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } /* Check if we are done */ - if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork)) + if (*current_ext == total_extents) *done = 1; del_cursor: @@ -5568,6 +5602,5 @@ del_cursor: error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); xfs_trans_log_inode(tp, ip, logflags); - return error; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index f84bd7af43be..b879ca56a64c 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -77,7 +77,6 @@ typedef struct xfs_bmap_free * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 -#define XFS_BMAPI_STACK_SWITCH 0x080 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ @@ -86,8 +85,7 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" } + { XFS_BMAPI_CONVERT, "CONVERT" } static inline int xfs_bmapi_aflag(int w) @@ -156,8 +154,8 @@ int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); -int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, + int whichfork); int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 818d546664e7..948836c4fd90 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -84,7 +84,7 @@ xfs_bmdr_to_bmbt( rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); fkp = XFS_BMDR_KEY_ADDR(dblock, 1); tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); @@ -443,7 +443,7 @@ xfs_bmbt_to_bmdr( ASSERT(rblock->bb_level != 0); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; - dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); tkp = XFS_BMDR_KEY_ADDR(dblock, 1); fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); @@ -519,7 +519,6 @@ xfs_bmbt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { xfs_alloc_arg_t args; /* block allocation args */ @@ -672,8 +671,7 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize, - level == 0); + return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); } STATIC void @@ -914,7 +912,6 @@ xfs_bmbt_maxrecs( */ int xfs_bmdr_maxrecs( - struct xfs_mount *mp, int blocklen, int leaf) { diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 6e42e1e50b89..819a8a4dee95 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -130,7 +130,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); -extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmdr_maxrecs(int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 01f6a646caa1..64731ef3324d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -249,49 +249,6 @@ xfs_bmap_rtalloc( } /* - * Stack switching interfaces for allocation - */ -static void -xfs_bmapi_allocate_worker( - struct work_struct *work) -{ - struct xfs_bmalloca *args = container_of(work, - struct xfs_bmalloca, work); - unsigned long pflags; - - /* we are in a transaction context here */ - current_set_flags_nested(&pflags, PF_FSTRANS); - - args->result = __xfs_bmapi_allocate(args); - complete(args->done); - - current_restore_flags_nested(&pflags, PF_FSTRANS); -} - -/* - * Some allocation requests often come in with little stack to work on. Push - * them off to a worker thread so there is lots of stack to use. Otherwise just - * call directly to avoid the context switch overhead here. - */ -int -xfs_bmapi_allocate( - struct xfs_bmalloca *args) -{ - DECLARE_COMPLETION_ONSTACK(done); - - if (!args->stack_switch) - return __xfs_bmapi_allocate(args); - - - args->done = &done; - INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); - queue_work(xfs_alloc_wq, &args->work); - wait_for_completion(&done); - destroy_work_on_stack(&args->work); - return args->result; -} - -/* * Check if the endoff is outside the last extent. If so the caller will grow * the allocation to a stripe unit boundary. All offsets are considered outside * the end of file for an empty fork, so 1 is returned in *eof in that case. @@ -1418,6 +1375,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1432,9 +1391,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, @@ -1508,7 +1476,6 @@ xfs_collapse_file_space( while (!error && !done) { tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - tp->t_flags |= XFS_TRANS_RESERVE; /* * We would need to reserve permanent block for transaction. * This will come into picture when after shifting extent into @@ -1518,7 +1485,6 @@ xfs_collapse_file_space( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); break; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 935ed2b24edf..2fdb72d2c908 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -50,12 +50,11 @@ struct xfs_bmalloca { xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ - char eof; /* set if allocating past last extent */ - char wasdel; /* replacing a delayed allocation */ - char userdata;/* set if is user data */ - char aeof; /* allocated space at eof */ - char conv; /* overwriting unwritten extents */ - char stack_switch; + bool eof; /* set if allocating past last extent */ + bool wasdel; /* replacing a delayed allocation */ + bool userdata;/* set if is user data */ + bool aeof; /* allocated space at eof */ + bool conv; /* overwriting unwritten extents */ int flags; struct completion *done; struct work_struct work; @@ -65,8 +64,6 @@ struct xfs_bmalloca { int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, int *committed); int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); -int xfs_bmapi_allocate(struct xfs_bmalloca *args); -int __xfs_bmapi_allocate(struct xfs_bmalloca *args); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e80d59fdf89a..cf893bc1e373 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -33,6 +33,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_alloc.h" /* * Cursor allocation zone. @@ -43,9 +44,10 @@ kmem_zone_t *xfs_btree_cur_zone; * Btree magic numbers. */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { - { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC }, + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + XFS_FIBT_MAGIC }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC } + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] @@ -552,14 +554,11 @@ xfs_btree_get_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(!xfs_buf_geterror(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* @@ -574,15 +573,12 @@ xfs_btree_get_bufs( xfs_agblock_t agbno, /* allocation group block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(!xfs_buf_geterror(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* @@ -722,7 +718,6 @@ xfs_btree_read_bufl( mp->m_bsize, lock, &bp, ops); if (error) return error; - ASSERT(!xfs_buf_geterror(bp)); if (bp) xfs_buf_set_ref(bp, refval); *bpp = bp; @@ -1115,6 +1110,7 @@ xfs_btree_set_refs( xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); break; case XFS_BTNUM_INO: + case XFS_BTNUM_FINO: xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); break; case XFS_BTNUM_BMAP: @@ -1159,7 +1155,6 @@ STATIC int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, - int level, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp) @@ -1178,7 +1173,6 @@ xfs_btree_read_buf_block( if (error) return error; - ASSERT(!xfs_buf_geterror(*bpp)); xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); return 0; @@ -1517,8 +1511,8 @@ xfs_btree_increment( union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); - error = xfs_btree_read_buf_block(cur, ptrp, --lev, - 0, &block, &bp); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; @@ -1616,8 +1610,8 @@ xfs_btree_decrement( union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); - error = xfs_btree_read_buf_block(cur, ptrp, --lev, - 0, &block, &bp); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); @@ -1667,7 +1661,7 @@ xfs_btree_lookup_get_block( return 0; } - error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp); + error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); if (error) return error; @@ -2018,7 +2012,7 @@ xfs_btree_lshift( goto out0; /* Set up the left neighbor as "left". */ - error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; @@ -2202,7 +2196,7 @@ xfs_btree_rshift( goto out0; /* Set up the right neighbor as "right". */ - error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; @@ -2330,7 +2324,7 @@ error1: * record (to be inserted into parent). */ STATIC int /* error */ -xfs_btree_split( +__xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, @@ -2372,7 +2366,7 @@ xfs_btree_split( xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) @@ -2470,7 +2464,7 @@ xfs_btree_split( * point back to right instead of to left. */ if (!xfs_btree_ptr_is_null(cur, &rrptr)) { - error = xfs_btree_read_buf_block(cur, &rrptr, level, + error = xfs_btree_read_buf_block(cur, &rrptr, 0, &rrblock, &rrbp); if (error) goto error0; @@ -2510,6 +2504,85 @@ error0: return error; } +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. + */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -2545,7 +2618,7 @@ xfs_btree_new_iroot( pp = xfs_btree_ptr_addr(cur, 1, block); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); if (error) goto error0; if (*stat == 0) { @@ -2649,7 +2722,7 @@ xfs_btree_new_root( cur->bc_ops->init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat); + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) @@ -2684,8 +2757,7 @@ xfs_btree_new_root( lbp = bp; xfs_btree_buf_to_ptr(cur, lbp, &lptr); left = block; - error = xfs_btree_read_buf_block(cur, &rptr, - cur->bc_nlevels - 1, 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; bp = rbp; @@ -2696,8 +2768,7 @@ xfs_btree_new_root( xfs_btree_buf_to_ptr(cur, rbp, &rptr); right = block; xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); - error = xfs_btree_read_buf_block(cur, &lptr, - cur->bc_nlevels - 1, 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; bp = lbp; @@ -3649,8 +3720,7 @@ xfs_btree_delrec( rptr = cptr; right = block; rbp = bp; - error = xfs_btree_read_buf_block(cur, &lptr, level, - 0, &left, &lbp); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; @@ -3667,8 +3737,7 @@ xfs_btree_delrec( lptr = cptr; left = block; lbp = bp; - error = xfs_btree_read_buf_block(cur, &rptr, level, - 0, &right, &rbp); + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; @@ -3740,8 +3809,7 @@ xfs_btree_delrec( /* If there is a right sibling, point it to the remaining block. */ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); if (!xfs_btree_ptr_is_null(cur, &cptr)) { - error = xfs_btree_read_buf_block(cur, &cptr, level, - 0, &rrblock, &rrbp); + error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); if (error) goto error0; xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 91e34f21bace..a04b69422f67 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -62,6 +62,7 @@ union xfs_btree_rec { #define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) +#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) /* * For logging record fields. @@ -92,6 +93,7 @@ do { \ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -105,6 +107,7 @@ do { \ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -129,7 +132,7 @@ struct xfs_btree_ops { int (*alloc_block)(struct xfs_btree_cur *cur, union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, - int length, int *stat); + int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* update last record information */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 107f2fdfe41f..7a34a1ae6552 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -216,8 +216,7 @@ _xfs_buf_alloc( STATIC int _xfs_buf_get_pages( xfs_buf_t *bp, - int page_count, - xfs_buf_flags_t flags) + int page_count) { /* Make sure that we have a page list */ if (bp->b_pages == NULL) { @@ -330,7 +329,7 @@ use_alloc_page: end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT; page_count = end - start; - error = _xfs_buf_get_pages(bp, page_count, flags); + error = _xfs_buf_get_pages(bp, page_count); if (unlikely(error)) return error; @@ -778,7 +777,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count); if (rval) return rval; @@ -811,7 +810,7 @@ xfs_buf_get_uncached( goto fail; page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; - error = _xfs_buf_get_pages(bp, page_count, 0); + error = _xfs_buf_get_pages(bp, page_count); if (error) goto fail_free_buf; @@ -1372,21 +1371,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( @@ -1607,7 +1614,6 @@ xfs_free_buftarg( int xfs_setsize_buftarg( xfs_buftarg_t *btp, - unsigned int blocksize, unsigned int sectorsize) { /* Set up metadata sector size info */ @@ -1642,16 +1648,13 @@ xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg(btp, PAGE_SIZE, - bdev_logical_block_size(bdev)); + return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); } xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev, - int external, - const char *fsname) + struct block_device *bdev) { xfs_buftarg_t *btp; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b8a3abf6cf47..3a7a5523d3dc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -298,11 +298,6 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, extern int xfs_bioerror_relse(struct xfs_buf *); -static inline int xfs_buf_geterror(xfs_buf_t *bp) -{ - return bp ? bp->b_error : ENOMEM; -} - /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); @@ -387,10 +382,10 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *, int, const char *); + struct block_device *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); -extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 8752821443be..4654338b03fc 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -812,7 +812,6 @@ xfs_buf_item_init( */ static void xfs_buf_item_log_segment( - struct xfs_buf_log_item *bip, uint first, uint last, uint *map) @@ -920,7 +919,7 @@ xfs_buf_item_log( if (end > last) end = last; - xfs_buf_item_log_segment(bip, first, end, + xfs_buf_item_log_segment(first, end, &bip->bli_formats[i].blf_data_map[0]); start += bp->b_maps[i].bm_len; @@ -1053,7 +1052,7 @@ xfs_buf_iodone_callbacks( static ulong lasttime; static xfs_buftarg_t *lasttarg; - if (likely(!xfs_buf_geterror(bp))) + if (likely(!bp->b_error)) goto do_callbacks; /* diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 6cc5f6785a77..a514ab616650 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -167,8 +167,8 @@ xfs_da3_node_verify( * we don't know if the node is for and attribute or directory tree, * so only fail if the count is outside both bounds */ - if (ichdr.count > mp->m_dir_node_ents && - ichdr.count > mp->m_attr_node_ents) + if (ichdr.count > mp->m_dir_geo->node_ents && + ichdr.count > mp->m_attr_geo->node_ents) return false; /* XXX: hash order check? */ @@ -598,7 +598,7 @@ xfs_da3_root_split( * Set up the new root node. */ error = xfs_da3_node_create(args, - (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0, + (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0, level + 1, &bp, args->whichfork); if (error) return error; @@ -616,10 +616,10 @@ xfs_da3_root_split( #ifdef DEBUG if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - ASSERT(blk1->blkno >= mp->m_dirleafblk && - blk1->blkno < mp->m_dirfreeblk); - ASSERT(blk2->blkno >= mp->m_dirleafblk && - blk2->blkno < mp->m_dirfreeblk); + ASSERT(blk1->blkno >= args->geo->leafblk && + blk1->blkno < args->geo->freeblk); + ASSERT(blk2->blkno >= args->geo->leafblk && + blk2->blkno < args->geo->freeblk); } #endif @@ -663,7 +663,7 @@ xfs_da3_node_split( /* * Do we have to split the node? */ - if (nodehdr.count + newcount > state->node_ents) { + if (nodehdr.count + newcount > state->args->geo->node_ents) { /* * Allocate a new node, add to the doubly linked chain of * nodes, then move some of our excess entries into it. @@ -894,8 +894,8 @@ xfs_da3_node_add( ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) - ASSERT(newblk->blkno >= state->mp->m_dirleafblk && - newblk->blkno < state->mp->m_dirfreeblk); + ASSERT(newblk->blkno >= state->args->geo->leafblk && + newblk->blkno < state->args->geo->freeblk); /* * We may need to make some room before we insert the new node. @@ -1089,14 +1089,15 @@ xfs_da3_root_join( * that could occur. For dir3 blocks we also need to update the block * number in the buffer header. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); + memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); root_blk->bp->b_ops = bp->b_ops; xfs_trans_buf_copy_type(root_blk->bp, bp); if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; da3->blkno = cpu_to_be64(root_blk->bp->b_bn); } - xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + xfs_trans_log_buf(args->trans, root_blk->bp, 0, + args->geo->blksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); } @@ -1139,7 +1140,7 @@ xfs_da3_node_toosmall( info = blk->bp->b_addr; node = (xfs_da_intnode_t *)info; dp->d_ops->node_hdr_from_disk(&nodehdr, node); - if (nodehdr.count > (state->node_ents >> 1)) { + if (nodehdr.count > (state->args->geo->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); /* blk over 50%, don't try to join */ } @@ -1176,8 +1177,8 @@ xfs_da3_node_toosmall( * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ - count = state->node_ents; - count -= state->node_ents >> 2; + count = state->args->geo->node_ents; + count -= state->args->geo->node_ents >> 2; count -= nodehdr.count; /* start with smaller blk num */ @@ -1472,7 +1473,7 @@ xfs_da3_node_lookup_int( * Descend thru the B-tree searching each level for the right * node to use, until the right hashval is found. */ - blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0; + blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; for (blk = &state->path.blk[0], state->path.active = 1; state->path.active <= XFS_DA_NODE_MAXDEPTH; blk++, state->path.active++) { @@ -2090,20 +2091,12 @@ xfs_da_grow_inode( xfs_dablk_t *new_blkno) { xfs_fileoff_t bno; - int count; int error; trace_xfs_da_grow_inode(args); - if (args->whichfork == XFS_DATA_FORK) { - bno = args->dp->i_mount->m_dirleafblk; - count = args->dp->i_mount->m_dirblkfsbs; - } else { - bno = 0; - count = 1; - } - - error = xfs_da_grow_inode_int(args, &bno, count); + bno = args->geo->leafblk; + error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount); if (!error) *new_blkno = (xfs_dablk_t)bno; return error; @@ -2158,7 +2151,7 @@ xfs_da3_swap_lastblock( w = args->whichfork; ASSERT(w == XFS_DATA_FORK); mp = dp->i_mount; - lastoff = mp->m_dirfreeblk; + lastoff = args->geo->freeblk; error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; @@ -2170,15 +2163,15 @@ xfs_da3_swap_lastblock( /* * Read the last block in the btree space. */ - last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; + last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount; error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w); if (error) return error; /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize); - xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); + memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; /* * Get values from the moved block. @@ -2247,7 +2240,7 @@ xfs_da3_swap_lastblock( sizeof(sib_info->back))); sib_buf = NULL; } - par_blkno = mp->m_dirleafblk; + par_blkno = args->geo->leafblk; level = -1; /* * Walk down the tree looking for the parent of the moved block. @@ -2357,10 +2350,7 @@ xfs_da_shrink_inode( w = args->whichfork; tp = args->trans; mp = dp->i_mount; - if (w == XFS_DATA_FORK) - count = mp->m_dirblkfsbs; - else - count = 1; + count = args->geo->fsbcount; for (;;) { /* * Remove extents. If we get ENOSPC for a dir we have to move @@ -2462,7 +2452,6 @@ xfs_buf_map_from_irec( */ static int xfs_dabuf_map( - struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, @@ -2480,7 +2469,10 @@ xfs_dabuf_map( ASSERT(map && *map); ASSERT(*nmaps == 1); - nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; + if (whichfork == XFS_DATA_FORK) + nfsb = mp->m_dir_geo->fsbcount; + else + nfsb = mp->m_attr_geo->fsbcount; /* * Caller doesn't have a mapping. -2 means don't complain @@ -2558,7 +2550,7 @@ xfs_da_get_buf( *bpp = NULL; mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2606,7 +2598,7 @@ xfs_da_read_buf( *bpp = NULL; mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ @@ -2625,47 +2617,6 @@ xfs_da_read_buf( xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); else xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - - /* - * This verification code will be moved to a CRC verification callback - * function so just leave it here unchanged until then. - */ - { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; - xfs_dir2_free_t *free = bp->b_addr; - xfs_da_blkinfo_t *info = bp->b_addr; - uint magic, magic1; - struct xfs_mount *mp = dp->i_mount; - - magic = be16_to_cpu(info->magic); - magic1 = be32_to_cpu(hdr->magic); - if (unlikely( - XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && - (magic != XFS_DA3_NODE_MAGIC) && - (magic != XFS_ATTR_LEAF_MAGIC) && - (magic != XFS_ATTR3_LEAF_MAGIC) && - (magic != XFS_DIR2_LEAF1_MAGIC) && - (magic != XFS_DIR3_LEAF1_MAGIC) && - (magic != XFS_DIR2_LEAFN_MAGIC) && - (magic != XFS_DIR3_LEAFN_MAGIC) && - (magic1 != XFS_DIR2_BLOCK_MAGIC) && - (magic1 != XFS_DIR3_BLOCK_MAGIC) && - (magic1 != XFS_DIR2_DATA_MAGIC) && - (magic1 != XFS_DIR3_DATA_MAGIC) && - (free->hdr.magic != - cpu_to_be32(XFS_DIR2_FREE_MAGIC)) && - (free->hdr.magic != - cpu_to_be32(XFS_DIR3_FREE_MAGIC)), - mp, XFS_ERRTAG_DA_READ_BUF, - XFS_RANDOM_DA_READ_BUF))) { - trace_xfs_da_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", - XFS_ERRLEVEL_LOW, mp, info); - error = XFS_ERROR(EFSCORRUPTED); - xfs_trans_brelse(trans, bp); - goto out_free; - } - } *bpp = bp; out_free: if (mapp != &map) @@ -2679,7 +2630,6 @@ out_free: */ xfs_daddr_t xfs_da_reada_buf( - struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, @@ -2693,7 +2643,7 @@ xfs_da_reada_buf( mapp = ↦ nmap = 1; - error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, &mapp, &nmap); if (error) { /* mapping a hole is not an error, but we don't continue */ diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6e95ea79f5d7..6e153e399a77 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -25,6 +25,23 @@ struct xfs_trans; struct zone; struct xfs_dir_ops; +/* + * Directory/attribute geometry information. There will be one of these for each + * data fork type, and it will be passed around via the xfs_da_args. Global + * structures will be attached to the xfs_mount. + */ +struct xfs_da_geometry { + int blksize; /* da block size in bytes */ + int fsbcount; /* da block size in filesystem blocks */ + uint8_t fsblog; /* log2 of _filesystem_ block size */ + uint8_t blklog; /* log2 of da block size */ + uint node_ents; /* # of entries in a danode */ + int magicpct; /* 37% of block size in bytes */ + xfs_dablk_t datablk; /* blockno of dir data v2 */ + xfs_dablk_t leafblk; /* blockno of leaf data v2 */ + xfs_dablk_t freeblk; /* blockno of free data v2 */ +}; + /*======================================================================== * Btree searching and modification structure definitions. *========================================================================*/ @@ -42,6 +59,7 @@ enum xfs_dacmp { * Structure to ease passing around component names. */ typedef struct xfs_da_args { + struct xfs_da_geometry *geo; /* da block geometry */ const __uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ __uint8_t filetype; /* filetype of inode for directories */ @@ -60,10 +78,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; @@ -108,8 +128,6 @@ typedef struct xfs_da_state_path { typedef struct xfs_da_state { xfs_da_args_t *args; /* filename arguments */ struct xfs_mount *mp; /* filesystem mount point */ - unsigned int blocksize; /* logical block size */ - unsigned int node_ents; /* how many entries in danode */ xfs_da_state_path_t path; /* search/split paths */ xfs_da_state_path_t altpath; /* alternate path for join */ unsigned char inleaf; /* insert into 1->lf, 0->splf */ @@ -183,9 +201,9 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp, int whichfork, const struct xfs_buf_ops *ops); -xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno, - int whichfork, const struct xfs_buf_ops *ops); +xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno, int whichfork, + const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c index e6c83e1fbc8a..c9aee52a37e2 100644 --- a/fs/xfs/xfs_da_format.c +++ b/fs/xfs/xfs_da_format.c @@ -26,8 +26,10 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_dir2.h" +#include "xfs_dir2_priv.h" /* * Shortform directory ops @@ -425,9 +427,9 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) * Directory Leaf block operations */ static int -xfs_dir2_max_leaf_ents(struct xfs_mount *mp) +xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) { - return (mp->m_dirblksize - sizeof(struct xfs_dir2_leaf_hdr)) / + return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / (uint)sizeof(struct xfs_dir2_leaf_entry); } @@ -438,9 +440,9 @@ xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) } static int -xfs_dir3_max_leaf_ents(struct xfs_mount *mp) +xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) { - return (mp->m_dirblksize - sizeof(struct xfs_dir3_leaf_hdr)) / + return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / (uint)sizeof(struct xfs_dir2_leaf_entry); } @@ -591,9 +593,9 @@ xfs_da3_node_hdr_to_disk( * Directory free space block operations */ static int -xfs_dir2_free_max_bests(struct xfs_mount *mp) +xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) { - return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) / + return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / sizeof(xfs_dir2_data_off_t); } @@ -607,24 +609,25 @@ xfs_dir2_free_bests_p(struct xfs_dir2_free *free) * Convert data space db to the corresponding free db. */ static xfs_dir2_db_t -xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) { - return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp); + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir2_free_max_bests(geo)); } /* * Convert data space db to the corresponding index in a free db. */ static int -xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) { - return db % xfs_dir2_free_max_bests(mp); + return db % xfs_dir2_free_max_bests(geo); } static int -xfs_dir3_free_max_bests(struct xfs_mount *mp) +xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) { - return (mp->m_dirblksize - sizeof(struct xfs_dir3_free_hdr)) / + return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / sizeof(xfs_dir2_data_off_t); } @@ -638,18 +641,19 @@ xfs_dir3_free_bests_p(struct xfs_dir2_free *free) * Convert data space db to the corresponding free db. */ static xfs_dir2_db_t -xfs_dir3_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) +xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) { - return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp); + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir3_free_max_bests(geo)); } /* * Convert data space db to the corresponding index in a free db. */ static int -xfs_dir3_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) +xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) { - return db % xfs_dir3_free_max_bests(mp); + return db % xfs_dir3_free_max_bests(geo); } static void diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h index a19d3f8f639c..0a49b0286372 100644 --- a/fs/xfs/xfs_da_format.h +++ b/fs/xfs/xfs_da_format.h @@ -19,10 +19,6 @@ #ifndef __XFS_DA_FORMAT_H__ #define __XFS_DA_FORMAT_H__ -/*======================================================================== - * Directory Structure when greater than XFS_LBSIZE(mp) bytes. - *========================================================================*/ - /* * This structure is common to both leaf nodes and non-leaf nodes in the Btree. * @@ -122,8 +118,6 @@ struct xfs_da3_icnode_hdr { __uint16_t level; }; -#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize - /* * Directory version 2. * @@ -330,8 +324,6 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) #define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) #define XFS_DIR2_DATA_SPACE 0 #define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_DATA_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET) /* * Describe a free area in the data block. @@ -456,8 +448,6 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) */ #define XFS_DIR2_LEAF_SPACE 1 #define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_LEAF_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET) /* * Leaf block header. @@ -514,17 +504,6 @@ struct xfs_dir3_leaf { #define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) /* - * Get address of the bestcount field in the single-leaf block. - */ -static inline struct xfs_dir2_leaf_tail * -xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp) -{ - return (struct xfs_dir2_leaf_tail *) - ((char *)lp + mp->m_dirblksize - - sizeof(struct xfs_dir2_leaf_tail)); -} - -/* * Get address of the bests array in the single-leaf block. */ static inline __be16 * @@ -534,123 +513,6 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) } /* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ - -/* - * Convert dataptr to byte in file space - */ -static inline xfs_dir2_off_t -xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; -} - -/* - * Convert byte in file space to dataptr. It had better be aligned. - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); -} - -/* - * Convert byte in space to (DB) block - */ -static inline xfs_dir2_db_t -xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_db_t) - (by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)); -} - -/* - * Convert dataptr to a block number - */ -static inline xfs_dir2_db_t -xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert byte in space to offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_data_aoff_t)(by & - ((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1)); -} - -/* - * Convert dataptr to a byte offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp)); -} - -/* - * Convert block and offset to byte in space - */ -static inline xfs_dir2_off_t -xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return ((xfs_dir2_off_t)db << - (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o; -} - -/* - * Convert block (DB) to block (dablk) - */ -static inline xfs_dablk_t -xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog); -} - -/* - * Convert byte in space to (DA) block - */ -static inline xfs_dablk_t -xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by)); -} - -/* - * Convert block and offset to dataptr - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o)); -} - -/* - * Convert block (dablk) to block (DB) - */ -static inline xfs_dir2_db_t -xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) -{ - return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog); -} - -/* - * Convert block (dablk) to byte offset in space - */ -static inline xfs_dir2_off_t -xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) -{ - return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0); -} - -/* * Free space block defintions for the node format. */ @@ -659,8 +521,6 @@ xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) */ #define XFS_DIR2_FREE_SPACE 2 #define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_FREE_FIRSTDB(mp) \ - xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET) typedef struct xfs_dir2_free_hdr { __be32 magic; /* XFS_DIR2_FREE_MAGIC */ @@ -736,16 +596,6 @@ typedef struct xfs_dir2_block_tail { } xfs_dir2_block_tail_t; /* - * Pointer to the leaf header embedded in a data block (1-block format) - */ -static inline struct xfs_dir2_block_tail * -xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr) -{ - return ((struct xfs_dir2_block_tail *) - ((char *)hdr + mp->m_dirblksize)) - 1; -} - -/* * Pointer to the leaf entries embedded in a data block (1-block format) */ static inline struct xfs_dir2_leaf_entry * @@ -764,10 +614,6 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) * of an attribute name may not be unique, we may have duplicate keys. The * internal links in the Btree are logical block offsets into the file. * - *======================================================================== - * Attribute structure when equal to XFS_LBSIZE(mp) bytes. - *======================================================================== - * * Struct leaf_entry's are packed from the top. Name/values grow from the * bottom but are not packed. The freemap contains run-length-encoded entries * for the free bytes after the leaf_entry's, but only the N largest such, diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index fda46253966a..79670cda48ae 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -85,38 +85,74 @@ static struct xfs_nameops xfs_ascii_ci_nameops = { .compname = xfs_ascii_ci_compname, }; -void -xfs_dir_mount( - xfs_mount_t *mp) +int +xfs_da_mount( + struct xfs_mount *mp) { - int nodehdr_size; + struct xfs_da_geometry *dageo; + int nodehdr_size; - ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); + ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= XFS_MAX_BLOCKSIZE); mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); - mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); - mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; - mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); - mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); - mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); - nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; - mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) / + mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_dir_geo || !mp->m_attr_geo) { + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); + return ENOMEM; + } + + /* set up directory geometry */ + dageo = mp->m_dir_geo; + dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + + /* + * Now we've set up the block conversion variables, we can calculate the + * segment block constants using the geometry structure. + */ + dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); + dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); + dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); + dageo->node_ents = (dageo->blksize - nodehdr_size) / (uint)sizeof(xfs_da_node_entry_t); - mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) / + dageo->magicpct = (dageo->blksize * 37) / 100; + + /* set up attribute geometry - single fsb only */ + dageo = mp->m_attr_geo; + dageo->blklog = mp->m_sb.sb_blocklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1; + dageo->node_ents = (dageo->blksize - nodehdr_size) / (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; - mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; if (xfs_sb_version_hasasciici(&mp->m_sb)) mp->m_dirnameops = &xfs_ascii_ci_nameops; else mp->m_dirnameops = &xfs_default_nameops; + return 0; +} + +void +xfs_da_unmount( + struct xfs_mount *mp) +{ + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); } /* @@ -192,6 +228,7 @@ xfs_dir_init( if (!args) return ENOMEM; + args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; error = xfs_dir2_sf_create(args, pdp->i_ino); @@ -226,6 +263,7 @@ xfs_dir_createname( if (!args) return ENOMEM; + args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; @@ -244,7 +282,7 @@ xfs_dir_createname( goto out_free; } - rval = xfs_dir2_isblock(tp, dp, &v); + rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { @@ -252,7 +290,7 @@ xfs_dir_createname( goto out_free; } - rval = xfs_dir2_isleaf(tp, dp, &v); + rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) @@ -320,6 +358,7 @@ xfs_dir_lookup( * annotations into the reclaim path for the ilock. */ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; @@ -336,7 +375,7 @@ xfs_dir_lookup( goto out_check_rval; } - rval = xfs_dir2_isblock(tp, dp, &v); + rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { @@ -344,7 +383,7 @@ xfs_dir_lookup( goto out_check_rval; } - rval = xfs_dir2_isleaf(tp, dp, &v); + rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) @@ -391,6 +430,7 @@ xfs_dir_removename( if (!args) return ENOMEM; + args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; @@ -408,7 +448,7 @@ xfs_dir_removename( goto out_free; } - rval = xfs_dir2_isblock(tp, dp, &v); + rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { @@ -416,7 +456,7 @@ xfs_dir_removename( goto out_free; } - rval = xfs_dir2_isleaf(tp, dp, &v); + rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) @@ -455,6 +495,7 @@ xfs_dir_replace( if (!args) return ENOMEM; + args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; @@ -472,7 +513,7 @@ xfs_dir_replace( goto out_free; } - rval = xfs_dir2_isblock(tp, dp, &v); + rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { @@ -480,7 +521,7 @@ xfs_dir_replace( goto out_free; } - rval = xfs_dir2_isleaf(tp, dp, &v); + rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) @@ -516,6 +557,7 @@ xfs_dir_canenter( if (!args) return ENOMEM; + args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; @@ -531,7 +573,7 @@ xfs_dir_canenter( goto out_free; } - rval = xfs_dir2_isblock(tp, dp, &v); + rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { @@ -539,7 +581,7 @@ xfs_dir_canenter( goto out_free; } - rval = xfs_dir2_isleaf(tp, dp, &v); + rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) @@ -579,13 +621,13 @@ xfs_dir2_grow_inode( * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); - count = mp->m_dirblkfsbs; + count = args->geo->fsbcount; error = xfs_da_grow_inode_int(args, &bno, count); if (error) return error; - *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); + *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. @@ -607,19 +649,16 @@ xfs_dir2_grow_inode( */ int xfs_dir2_isblock( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is block, 0 is not block */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; - ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); + rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; + ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); *vp = rval; return 0; } @@ -629,18 +668,15 @@ xfs_dir2_isblock( */ int xfs_dir2_isleaf( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is leaf, 0 is not leaf */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); + *vp = last == args->geo->leafblk + args->geo->fsbcount; return 0; } @@ -668,11 +704,11 @@ xfs_dir2_shrink_inode( dp = args->dp; mp = dp->i_mount; tp = args->trans; - da = xfs_dir2_db_to_da(mp, db); + da = xfs_dir2_db_to_da(args->geo, db); /* * Unmap the fsblock(s). */ - if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, + if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, &done))) { /* @@ -699,12 +735,12 @@ xfs_dir2_shrink_inode( /* * If it's not a data block, we're done. */ - if (db >= XFS_DIR2_LEAF_FIRSTDB(mp)) + if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) return 0; /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0)) + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { @@ -713,7 +749,7 @@ xfs_dir2_shrink_inode( */ return error; } - if (db == mp->m_dirdatablk) + if (db == args->geo->datablk) ASSERT(bno == 0); else ASSERT(bno > 0); diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index cec70e0781ab..c8e86b0b5e99 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -80,7 +80,7 @@ struct xfs_dir_ops { struct xfs_dir3_icleaf_hdr *from); void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from); - int (*leaf_max_ents)(struct xfs_mount *mp); + int (*leaf_max_ents)(struct xfs_da_geometry *geo); struct xfs_dir2_leaf_entry * (*leaf_ents_p)(struct xfs_dir2_leaf *lp); @@ -97,10 +97,12 @@ struct xfs_dir_ops { struct xfs_dir3_icfree_hdr *from); void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from); - int (*free_max_bests)(struct xfs_mount *mp); + int (*free_max_bests)(struct xfs_da_geometry *geo); __be16 * (*free_bests_p)(struct xfs_dir2_free *free); - xfs_dir2_db_t (*db_to_fdb)(struct xfs_mount *mp, xfs_dir2_db_t db); - int (*db_to_fdindex)(struct xfs_mount *mp, xfs_dir2_db_t db); + xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); + int (*db_to_fdindex)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); }; extern const struct xfs_dir_ops * @@ -112,7 +114,9 @@ extern const struct xfs_dir_ops * * Generic directory interface routines */ extern void xfs_dir_startup(void); -extern void xfs_dir_mount(struct xfs_mount *mp); +extern int xfs_da_mount(struct xfs_mount *mp); +extern void xfs_da_unmount(struct xfs_mount *mp); + extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); @@ -142,23 +146,23 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); /* * Interface routines used by userspace utilities */ -extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r); -extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r); +extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_inode *dp, +extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); -extern void xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_inode *dp, +extern void xfs_dir2_data_log_header(struct xfs_da_args *args, struct xfs_buf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_dir2_data_unused *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_inode *dp, +extern void xfs_dir2_data_log_unused(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); +extern void xfs_dir2_data_make_free(struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_inode *dp, +extern void xfs_dir2_data_use_free(struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 4f6a38cb83a4..c7cd3154026a 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -136,7 +136,7 @@ xfs_dir3_block_read( struct xfs_mount *mp = dp->i_mount; int err; - err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); if (!err && tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); @@ -281,8 +281,7 @@ out: */ static void xfs_dir2_block_compact( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_block_tail *btp, @@ -315,18 +314,17 @@ xfs_dir2_block_compact( *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); *lfloghigh -= be32_to_cpu(btp->stale) - 1; be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, dp, bp, + xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; btp->stale = cpu_to_be32(1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) - xfs_dir2_data_freescan(dp, hdr, needlog); + xfs_dir2_data_freescan(args->dp, hdr, needlog); } /* @@ -378,7 +376,7 @@ xfs_dir2_block_addname( * Set up pointers to parts of the block. */ hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -421,7 +419,7 @@ xfs_dir2_block_addname( * If need to compact the leaf entries, do it now. */ if (compact) { - xfs_dir2_block_compact(tp, dp, bp, hdr, btp, blp, &needlog, + xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog, &lfloghigh, &lfloglow); /* recalculate blp post-compaction */ blp = xfs_dir2_block_leaf_p(btp); @@ -456,7 +454,7 @@ xfs_dir2_block_addname( /* * Mark the space needed for the new leaf entry, now in use. */ - xfs_dir2_data_use_free(tp, dp, bp, enddup, + xfs_dir2_data_use_free(args, bp, enddup, (xfs_dir2_data_aoff_t) ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - sizeof(*blp)), @@ -537,13 +535,13 @@ xfs_dir2_block_addname( * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); - blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* * Mark space for the data entry used. */ - xfs_dir2_data_use_free(tp, dp, bp, dup, + xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), (xfs_dir2_data_aoff_t)len, &needlog, &needscan); /* @@ -561,9 +559,9 @@ xfs_dir2_block_addname( if (needscan) xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dp, bp); + xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_log_entry(tp, dp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; } @@ -582,7 +580,7 @@ xfs_dir2_block_log_leaf( xfs_dir2_leaf_entry_t *blp; xfs_dir2_block_tail_t *btp; - btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); blp = xfs_dir2_block_leaf_p(btp); xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); @@ -599,7 +597,7 @@ xfs_dir2_block_log_tail( xfs_dir2_data_hdr_t *hdr = bp->b_addr; xfs_dir2_block_tail_t *btp; - btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), (uint)((char *)(btp + 1) - (char *)hdr - 1)); } @@ -634,13 +632,14 @@ xfs_dir2_block_lookup( mp = dp->i_mount; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ dep = (xfs_dir2_data_entry_t *)((char *)hdr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* * Fill in inode number, CI name if appropriate, release the block. */ @@ -686,7 +685,7 @@ xfs_dir2_block_lookup_int( hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. @@ -724,7 +723,7 @@ xfs_dir2_block_lookup_int( * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); + ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr)); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store @@ -791,18 +790,19 @@ xfs_dir2_block_removename( tp = args->trans; mp = dp->i_mount; hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; - xfs_dir2_data_make_free(tp, dp, bp, + xfs_dir2_data_make_free(args, bp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* @@ -821,7 +821,7 @@ xfs_dir2_block_removename( if (needscan) xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dp, bp); + xfs_dir2_data_log_header(args, bp); xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. @@ -866,20 +866,21 @@ xfs_dir2_block_replace( dp = args->dp; mp = dp->i_mount; hdr = bp->b_addr; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); dp->d_ops->data_put_ftype(dep, args->filetype); - xfs_dir2_data_log_entry(args->trans, dp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); xfs_dir3_data_check(dp, bp); return 0; } @@ -939,7 +940,7 @@ xfs_dir2_leaf_to_block( leaf = lbp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); @@ -949,13 +950,13 @@ xfs_dir2_leaf_to_block( * been left behind during no-space-reservation operations. * These will show up in the leaf bests table. */ - while (dp->i_d.di_size > mp->m_dirblksize) { + while (dp->i_d.di_size > args->geo->blksize) { int hdrsz; hdrsz = dp->d_ops->data_entry_offset; bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - hdrsz) { + args->geo->blksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) @@ -967,7 +968,7 @@ xfs_dir2_leaf_to_block( * Read the data block if we don't already have it, give up if it fails. */ if (!dbp) { - error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); + error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp); if (error) return error; } @@ -983,7 +984,7 @@ xfs_dir2_leaf_to_block( /* * Look at the last data entry. */ - tagp = (__be16 *)((char *)hdr + mp->m_dirblksize) - 1; + tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1; dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free or is too short we can't do it. @@ -1002,12 +1003,12 @@ xfs_dir2_leaf_to_block( /* * Use up the space at the end of the block (blp/btp). */ - xfs_dir2_data_use_free(tp, dp, dbp, dup, mp->m_dirblksize - size, size, + xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, &needlog, &needscan); /* * Initialize the block tail. */ - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); @@ -1028,11 +1029,11 @@ xfs_dir2_leaf_to_block( if (needscan) xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * Pitch the old leaf block. */ - error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); + error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp); if (error) return error; @@ -1141,13 +1142,13 @@ xfs_dir2_sf_to_block( */ dup = dp->d_ops->data_unused_p(hdr); needlog = needscan = 0; - xfs_dir2_data_use_free(tp, dp, bp, dup, mp->m_dirblksize - i, i, &needlog, - &needscan); + xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, + i, &needlog, &needscan); ASSERT(needscan == 0); /* * Fill in the tail. */ - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */ btp->stale = 0; blp = xfs_dir2_block_leaf_p(btp); @@ -1155,7 +1156,7 @@ xfs_dir2_sf_to_block( /* * Remove the freespace, we'll manage it. */ - xfs_dir2_data_use_free(tp, dp, bp, dup, + xfs_dir2_data_use_free(args, bp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), be16_to_cpu(dup->length), &needlog, &needscan); /* @@ -1168,9 +1169,9 @@ xfs_dir2_sf_to_block( dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, dp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); /* * Create entry for .. @@ -1182,9 +1183,9 @@ xfs_dir2_sf_to_block( dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, dp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); offset = dp->d_ops->data_first_offset; /* @@ -1216,7 +1217,7 @@ xfs_dir2_sf_to_block( dup->length = cpu_to_be16(newoffset - offset); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( ((char *)dup - (char *)hdr)); - xfs_dir2_data_log_unused(tp, bp, dup); + xfs_dir2_data_log_unused(args, bp, dup); xfs_dir2_data_freeinsert(hdr, dp->d_ops->data_bestfree_p(hdr), dup, &dummy); @@ -1233,12 +1234,12 @@ xfs_dir2_sf_to_block( memcpy(dep->name, sfep->name, dep->namelen); tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, dp, bp, dep); + xfs_dir2_data_log_entry(args, bp, dep); name.name = sfep->name; name.len = sfep->namelen; blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> hashname(&name)); - blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( (char *)dep - (char *)hdr)); offset = (int)((char *)(tagp + 1) - (char *)hdr); if (++i == sfp->count) diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index afa4ad523f3f..8c2f6422648e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -63,8 +63,10 @@ __xfs_dir3_data_check( int stale; /* count of stale leaves */ struct xfs_name name; const struct xfs_dir_ops *ops; + struct xfs_da_geometry *geo; mp = bp->b_target->bt_mount; + geo = mp->m_dir_geo; /* * We can be passed a null dp here from a verifier, so we need to go the @@ -78,7 +80,7 @@ __xfs_dir3_data_check( switch (hdr->magic) { case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; @@ -94,7 +96,7 @@ __xfs_dir3_data_check( break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): case cpu_to_be32(XFS_DIR2_DATA_MAGIC): - endp = (char *)hdr + mp->m_dirblksize; + endp = (char *)hdr + geo->blksize; break; default: XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); @@ -172,9 +174,9 @@ __xfs_dir3_data_check( lastfree = 0; if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { - addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)hdr)); + addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)hdr)); name.name = dep->name; name.len = dep->namelen; hash = mp->m_dirnameops->hashname(&name); @@ -329,12 +331,11 @@ xfs_dir3_data_read( int xfs_dir3_data_readahead( - struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno) { - return xfs_da_reada_buf(tp, dp, bno, mapped_bno, + return xfs_da_reada_buf(dp, bno, mapped_bno, XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); } @@ -510,6 +511,7 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ + struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -528,10 +530,10 @@ xfs_dir2_data_freescan( p = (char *)dp->d_ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { - btp = xfs_dir2_block_tail_p(dp->i_mount, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); endp = (char *)xfs_dir2_block_leaf_p(btp); } else - endp = (char *)hdr + dp->i_mount->m_dirblksize; + endp = (char *)hdr + geo->blksize; /* * Loop over the block's entries. */ @@ -585,8 +587,8 @@ xfs_dir3_data_init( /* * Get the buffer set up for the block. */ - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, - XFS_DATA_FORK); + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno), + -1, &bp, XFS_DATA_FORK); if (error) return error; bp->b_ops = &xfs_dir3_data_buf_ops; @@ -621,15 +623,15 @@ xfs_dir3_data_init( dup = dp->d_ops->data_unused_p(hdr); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t = mp->m_dirblksize - (uint)dp->d_ops->data_entry_offset; + t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset; bf[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* * Log it and return it. */ - xfs_dir2_data_log_header(tp, dp, bp); - xfs_dir2_data_log_unused(tp, bp, dup); + xfs_dir2_data_log_header(args, bp); + xfs_dir2_data_log_unused(args, bp, dup); *bpp = bp; return 0; } @@ -639,8 +641,7 @@ xfs_dir3_data_init( */ void xfs_dir2_data_log_entry( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { @@ -651,8 +652,8 @@ xfs_dir2_data_log_entry( hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), - (uint)((char *)(dp->d_ops->data_entry_tag_p(dep) + 1) - + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr), + (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) - (char *)hdr - 1)); } @@ -661,8 +662,7 @@ xfs_dir2_data_log_entry( */ void xfs_dir2_data_log_header( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp) { #ifdef DEBUG @@ -674,7 +674,8 @@ xfs_dir2_data_log_header( hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); #endif - xfs_trans_log_buf(tp, bp, 0, dp->d_ops->data_entry_offset - 1); + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->data_entry_offset - 1); } /* @@ -682,7 +683,7 @@ xfs_dir2_data_log_header( */ void xfs_dir2_data_log_unused( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_unused_t *dup) /* data unused pointer */ { @@ -696,13 +697,13 @@ xfs_dir2_data_log_unused( /* * Log the first part of the unused entry. */ - xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr), + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ - xfs_trans_log_buf(tp, bp, + xfs_trans_log_buf(args->trans, bp, (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); @@ -714,8 +715,7 @@ xfs_dir2_data_log_unused( */ void xfs_dir2_data_make_free( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, /* starting byte offset */ xfs_dir2_data_aoff_t len, /* length in bytes */ @@ -725,14 +725,12 @@ xfs_dir2_data_make_free( xfs_dir2_data_hdr_t *hdr; /* data block pointer */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ char *endptr; /* end of data area */ - xfs_mount_t *mp; /* filesystem mount point */ int needscan; /* need to regen bestfree */ xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ struct xfs_dir2_data_free *bf; - mp = tp->t_mountp; hdr = bp->b_addr; /* @@ -740,20 +738,20 @@ xfs_dir2_data_make_free( */ if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - endptr = (char *)hdr + mp->m_dirblksize; + endptr = (char *)hdr + args->geo->blksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); } /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > dp->d_ops->data_entry_offset) { + if (offset > args->dp->d_ops->data_entry_offset) { __be16 *tagp; /* tag just before us */ tagp = (__be16 *)((char *)hdr + offset) - 1; @@ -779,7 +777,7 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ - bf = dp->d_ops->data_bestfree_p(hdr); + bf = args->dp->d_ops->data_bestfree_p(hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ @@ -801,7 +799,7 @@ xfs_dir2_data_make_free( be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, prevdup); + xfs_dir2_data_log_unused(args, bp, prevdup); if (!needscan) { /* * Has to be the case that entries 0 and 1 are @@ -836,7 +834,7 @@ xfs_dir2_data_make_free( be16_add_cpu(&prevdup->length, len); *xfs_dir2_data_unused_tag_p(prevdup) = cpu_to_be16((char *)prevdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, prevdup); + xfs_dir2_data_log_unused(args, bp, prevdup); /* * If the previous entry was in the table, the new entry * is longer, so it will be in the table too. Remove @@ -864,7 +862,7 @@ xfs_dir2_data_make_free( newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If the following entry was in the table, the new entry * is longer, so it will be in the table too. Remove @@ -891,7 +889,7 @@ xfs_dir2_data_make_free( newdup->length = cpu_to_be16(len); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); } *needscanp = needscan; @@ -902,8 +900,7 @@ xfs_dir2_data_make_free( */ void xfs_dir2_data_use_free( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_unused_t *dup, /* unused entry */ xfs_dir2_data_aoff_t offset, /* starting offset to use */ @@ -934,7 +931,7 @@ xfs_dir2_data_use_free( * Look up the entry in the bestfree table. */ oldlen = be16_to_cpu(dup->length); - bf = dp->d_ops->data_bestfree_p(hdr); + bf = args->dp->d_ops->data_bestfree_p(hdr); dfp = xfs_dir2_data_freefind(hdr, bf, dup); ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* @@ -966,7 +963,7 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(oldlen - len); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ @@ -994,7 +991,7 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ @@ -1022,13 +1019,13 @@ xfs_dir2_data_use_free( newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); *xfs_dir2_data_unused_tag_p(newdup) = cpu_to_be16((char *)newdup - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup); + xfs_dir2_data_log_unused(args, bp, newdup); newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); *xfs_dir2_data_unused_tag_p(newdup2) = cpu_to_be16((char *)newdup2 - (char *)hdr); - xfs_dir2_data_log_unused(tp, bp, newdup2); + xfs_dir2_data_log_unused(args, bp, newdup2); /* * If the old entry was in the table, we need to scan * if the 3rd entry was valid, since these entries diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index d36e97df1187..fb0aad4440c1 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -41,9 +41,10 @@ */ static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, int *indexp, struct xfs_buf **dbpp); -static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, - int first, int last); -static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); +static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, + struct xfs_buf *bp); /* * Check the internal consistency of a leaf1 block. @@ -92,6 +93,7 @@ xfs_dir3_leaf_check_int( int i; const struct xfs_dir_ops *ops; struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_da_geometry *geo = mp->m_dir_geo; /* * we can be passed a null dp here from a verifier, so we need to go the @@ -105,14 +107,14 @@ xfs_dir3_leaf_check_int( } ents = ops->leaf_ents_p(leaf); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); /* * XXX (dgc): This value is not restrictive enough. * Should factor in the size of the bests table as well. * We can deduce a value for that from di_size. */ - if (hdr->count > ops->leaf_max_ents(mp)) + if (hdr->count > ops->leaf_max_ents(geo)) return false; /* Leaves and bests don't overlap in leaf format. */ @@ -323,7 +325,7 @@ xfs_dir3_leaf_init( if (type == XFS_DIR2_LEAF1_MAGIC) { struct xfs_dir2_leaf_tail *ltp; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf); ltp->bestcount = 0; bp->b_ops = &xfs_dir3_leaf1_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); @@ -347,18 +349,18 @@ xfs_dir3_leaf_get_buf( int error; ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && - bno < XFS_DIR2_FREE_FIRSTDB(mp)); + ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) && + bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, - XFS_DATA_FORK); + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno), + -1, &bp, XFS_DATA_FORK); if (error) return error; xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); - xfs_dir3_leaf_log_header(tp, dp, bp); + xfs_dir3_leaf_log_header(args, bp); if (magic == XFS_DIR2_LEAF1_MAGIC) - xfs_dir3_leaf_log_tail(tp, bp); + xfs_dir3_leaf_log_tail(args, bp); *bpp = bp; return 0; } @@ -403,8 +405,8 @@ xfs_dir2_block_to_leaf( if ((error = xfs_da_grow_inode(args, &blkno))) { return error; } - ldb = xfs_dir2_da_to_db(mp, blkno); - ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); + ldb = xfs_dir2_da_to_db(args->geo, blkno); + ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)); /* * Initialize the leaf block, get a buffer for it. */ @@ -415,7 +417,7 @@ xfs_dir2_block_to_leaf( leaf = lbp->b_addr; hdr = dbp->b_addr; xfs_dir3_data_check(dp, dbp); - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); bf = dp->d_ops->data_bestfree_p(hdr); ents = dp->d_ops->leaf_ents_p(leaf); @@ -427,23 +429,23 @@ xfs_dir2_block_to_leaf( leafhdr.count = be32_to_cpu(btp->count); leafhdr.stale = be32_to_cpu(btp->stale); dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, dp, lbp); + xfs_dir3_leaf_log_header(args, lbp); /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. */ memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, dp, lbp, 0, leafhdr.count - 1); + xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* * Make the space formerly occupied by the leaf entries and block * tail be free. */ - xfs_dir2_data_make_free(tp, dp, dbp, + xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), - (xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize - + (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize - (char *)blp), &needlog, &needscan); /* @@ -461,7 +463,7 @@ xfs_dir2_block_to_leaf( /* * Set up leaf tail and bests table. */ - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ltp->bestcount = cpu_to_be32(1); bestsp = xfs_dir2_leaf_bests_p(ltp); bestsp[0] = bf[0].length; @@ -469,10 +471,10 @@ xfs_dir2_block_to_leaf( * Log the data header and leaf bests table. */ if (needlog) - xfs_dir2_data_log_header(tp, dp, dbp); + xfs_dir2_data_log_header(args, dbp); xfs_dir3_leaf_check(dp, lbp); xfs_dir3_data_check(dp, dbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, 0); + xfs_dir3_leaf_log_bests(args, lbp, 0, 0); return 0; } @@ -641,7 +643,7 @@ xfs_dir2_leaf_addname( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; @@ -653,7 +655,7 @@ xfs_dir2_leaf_addname( */ index = xfs_dir2_leaf_search_hash(args, lbp); leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ents = dp->d_ops->leaf_ents_p(leaf); dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); @@ -670,7 +672,7 @@ xfs_dir2_leaf_addname( index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; - i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(i < be32_to_cpu(ltp->bestcount)); ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF)); if (be16_to_cpu(bestsp[i]) >= length) { @@ -810,14 +812,15 @@ xfs_dir2_leaf_addname( memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); be32_add_cpu(<p->bestcount, 1); - xfs_dir3_leaf_log_tail(tp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } /* * If we're filling in a previously empty block just log it. */ else - xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); hdr = dbp->b_addr; bf = dp->d_ops->data_bestfree_p(hdr); bestsp[use_block] = bf[0].length; @@ -828,8 +831,8 @@ xfs_dir2_leaf_addname( * Just read that one in. */ error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, use_block), - -1, &dbp); + xfs_dir2_db_to_da(args->geo, use_block), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -848,7 +851,7 @@ xfs_dir2_leaf_addname( /* * Mark the initial part of our freespace in use for the new entry. */ - xfs_dir2_data_use_free(tp, dp, dbp, dup, + xfs_dir2_data_use_free(args, dbp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* @@ -870,8 +873,8 @@ xfs_dir2_leaf_addname( * Need to log the data block's header. */ if (needlog) - xfs_dir2_data_log_header(tp, dp, dbp); - xfs_dir2_data_log_entry(tp, dp, dbp, dep); + xfs_dir2_data_log_header(args, dbp); + xfs_dir2_data_log_entry(args, dbp, dep); /* * If the bests table needs to be changed, do it. * Log the change unless we've already done that. @@ -879,7 +882,7 @@ xfs_dir2_leaf_addname( if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { bestsp[use_block] = bf[0].length; if (!grown) - xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block); + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); } lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, @@ -889,14 +892,15 @@ xfs_dir2_leaf_addname( * Fill in the new leaf entry. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block, + lep->address = cpu_to_be32( + xfs_dir2_db_off_to_dataptr(args->geo, use_block, be16_to_cpu(*tagp))); /* * Log the leaf fields and give up the buffers. */ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, dp, lbp); - xfs_dir3_leaf_log_ents(tp, dp, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); xfs_dir3_leaf_check(dp, lbp); xfs_dir3_data_check(dp, dbp); return 0; @@ -948,9 +952,9 @@ xfs_dir3_leaf_compact( leafhdr->stale = 0; dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); - xfs_dir3_leaf_log_header(args->trans, dp, bp); + xfs_dir3_leaf_log_header(args, bp); if (loglow != -1) - xfs_dir3_leaf_log_ents(args->trans, dp, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); } /* @@ -1052,7 +1056,7 @@ xfs_dir3_leaf_compact_x1( */ static void xfs_dir3_leaf_log_bests( - xfs_trans_t *tp, /* transaction pointer */ + struct xfs_da_args *args, struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ @@ -1065,10 +1069,11 @@ xfs_dir3_leaf_log_bests( ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); - ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); firstb = xfs_dir2_leaf_bests_p(ltp) + first; lastb = xfs_dir2_leaf_bests_p(ltp) + last; - xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1077,8 +1082,7 @@ xfs_dir3_leaf_log_bests( */ void xfs_dir3_leaf_log_ents( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp, int first, int last) @@ -1093,10 +1097,11 @@ xfs_dir3_leaf_log_ents( leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - ents = dp->d_ops->leaf_ents_p(leaf); + ents = args->dp->d_ops->leaf_ents_p(leaf); firstlep = &ents[first]; lastlep = &ents[last]; - xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1105,8 +1110,7 @@ xfs_dir3_leaf_log_ents( */ void xfs_dir3_leaf_log_header( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp) { struct xfs_dir2_leaf *leaf = bp->b_addr; @@ -1116,8 +1120,9 @@ xfs_dir3_leaf_log_header( leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - dp->d_ops->leaf_hdr_size - 1); + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&leaf->hdr - (char *)leaf), + args->dp->d_ops->leaf_hdr_size - 1); } /* @@ -1125,21 +1130,20 @@ xfs_dir3_leaf_log_header( */ STATIC void xfs_dir3_leaf_log_tail( - struct xfs_trans *tp, + struct xfs_da_args *args, struct xfs_buf *bp) { struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - struct xfs_mount *mp = tp->t_mountp; ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); - xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), - (uint)(mp->m_dirblksize - 1)); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(args->geo->blksize - 1)); } /* @@ -1185,7 +1189,7 @@ xfs_dir2_leaf_lookup( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); /* * Return the found inode number & CI name if appropriate */ @@ -1231,7 +1235,7 @@ xfs_dir2_leaf_lookup_int( tp = args->trans; mp = dp->i_mount; - error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; @@ -1260,7 +1264,8 @@ xfs_dir2_leaf_lookup_int( /* * Get the new data block number. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * If it's not the same as the old data block number, * need to pitch the old one and read the new one. @@ -1269,8 +1274,8 @@ xfs_dir2_leaf_lookup_int( if (dbp) xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, newdb), - -1, &dbp); + xfs_dir2_db_to_da(args->geo, newdb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1281,7 +1286,8 @@ xfs_dir2_leaf_lookup_int( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); /* * Compare name and if it's an exact match, return the index * and buffer. If it's the first case-insensitive match, store @@ -1310,8 +1316,8 @@ xfs_dir2_leaf_lookup_int( if (cidb != curdb) { xfs_trans_brelse(tp, dbp); error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, cidb), - -1, &dbp); + xfs_dir2_db_to_da(args->geo, cidb), + -1, &dbp); if (error) { xfs_trans_brelse(tp, lbp); return error; @@ -1380,18 +1386,18 @@ xfs_dir2_leaf_removename( * Point to the leaf entry, use that to point to the data entry. */ lep = &ents[index]; - db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); needscan = needlog = 0; oldbest = be16_to_cpu(bf[0].length); - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); /* * Mark the former data entry unused. */ - xfs_dir2_data_make_free(tp, dp, dbp, + xfs_dir2_data_make_free(args, dbp, (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* @@ -1399,10 +1405,10 @@ xfs_dir2_leaf_removename( */ leafhdr.stale++; dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, dp, lbp); + xfs_dir3_leaf_log_header(args, lbp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(tp, dp, lbp, index, index); + xfs_dir3_leaf_log_ents(args, lbp, index, index); /* * Scan the freespace in the data block again if necessary, @@ -1411,22 +1417,22 @@ xfs_dir2_leaf_removename( if (needscan) xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. */ if (be16_to_cpu(bf[0].length) != oldbest) { bestsp[db] = bf[0].length; - xfs_dir3_leaf_log_bests(tp, lbp, db, db); + xfs_dir3_leaf_log_bests(args, lbp, db, db); } xfs_dir3_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. */ if (be16_to_cpu(bf[0].length) == - mp->m_dirblksize - dp->d_ops->data_entry_offset) { - ASSERT(db != mp->m_dirdatablk); + args->geo->blksize - dp->d_ops->data_entry_offset) { + ASSERT(db != args->geo->datablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* * Nope, can't get rid of it because it caused @@ -1459,15 +1465,16 @@ xfs_dir2_leaf_removename( memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); be32_add_cpu(<p->bestcount, -(db - i)); - xfs_dir3_leaf_log_tail(tp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } else bestsp[db] = cpu_to_be16(NULLDATAOFF); } /* * If the data block was not the first one, drop it. */ - else if (db != mp->m_dirdatablk) + else if (db != args->geo->datablk) dbp = NULL; xfs_dir3_leaf_check(dp, lbp); @@ -1515,7 +1522,7 @@ xfs_dir2_leaf_replace( */ dep = (xfs_dir2_data_entry_t *) ((char *)dbp->b_addr + - xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* * Put the new inode number in, log it. @@ -1523,7 +1530,7 @@ xfs_dir2_leaf_replace( dep->inumber = cpu_to_be64(args->inumber); dp->d_ops->data_put_ftype(dep, args->filetype); tp = args->trans; - xfs_dir2_data_log_entry(tp, dp, dbp, dep); + xfs_dir2_data_log_entry(args, dbp, dep); xfs_dir3_leaf_check(dp, lbp); xfs_trans_brelse(tp, lbp); return 0; @@ -1609,12 +1616,13 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), + -1, &dbp); if (error) return error; leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); #ifdef DEBUG { @@ -1624,7 +1632,7 @@ xfs_dir2_leaf_trim_data( ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); ASSERT(be16_to_cpu(bf[0].length) == - mp->m_dirblksize - dp->d_ops->data_entry_offset); + args->geo->blksize - dp->d_ops->data_entry_offset); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); } #endif @@ -1643,8 +1651,8 @@ xfs_dir2_leaf_trim_data( bestsp = xfs_dir2_leaf_bests_p(ltp); be32_add_cpu(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); - xfs_dir3_leaf_log_tail(tp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); return 0; } @@ -1708,22 +1716,22 @@ xfs_dir2_node_to_leaf( /* * Get the last offset in the file. */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) { + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) { return error; } - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; /* * If there are freespace blocks other than the first one, * take this opportunity to remove trailing empty freespace blocks * that may have been left behind during no-space-reservation * operations. */ - while (fo > mp->m_dirfreeblk) { + while (fo > args->geo->freeblk) { if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) { return error; } if (rval) - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; else return 0; } @@ -1736,7 +1744,7 @@ xfs_dir2_node_to_leaf( /* * If it's not the single leaf block, give up. */ - if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) + if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize) return 0; lbp = state->path.blk[0].bp; leaf = lbp->b_addr; @@ -1748,7 +1756,7 @@ xfs_dir2_node_to_leaf( /* * Read the freespace block. */ - error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); + error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); if (error) return error; free = fbp->b_addr; @@ -1760,7 +1768,7 @@ xfs_dir2_node_to_leaf( * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) { + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) { xfs_trans_brelse(tp, fbp); return 0; } @@ -1780,7 +1788,7 @@ xfs_dir2_node_to_leaf( /* * Set up the leaf tail from the freespace block. */ - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ltp->bestcount = cpu_to_be32(freehdr.nvalid); /* @@ -1790,15 +1798,17 @@ xfs_dir2_node_to_leaf( freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, dp, lbp); - xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); - xfs_dir3_leaf_log_tail(tp, lbp); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); xfs_dir3_leaf_check(dp, lbp); /* * Get rid of the freespace block. */ - error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp); + error = xfs_dir2_shrink_inode(args, + xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET), + fbp); if (error) { /* * This can't fail here because it can only happen when diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index cb434d732681..da43d304fca2 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -195,17 +195,18 @@ xfs_dir2_free_try_read( static int xfs_dir3_free_get_buf( - struct xfs_trans *tp, - struct xfs_inode *dp, + xfs_da_args_t *args, xfs_dir2_db_t fbno, struct xfs_buf **bpp) { + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; struct xfs_dir3_icfree_hdr hdr; - error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), -1, &bp, XFS_DATA_FORK); if (error) return error; @@ -240,8 +241,7 @@ xfs_dir3_free_get_buf( */ STATIC void xfs_dir2_free_log_bests( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ @@ -250,10 +250,10 @@ xfs_dir2_free_log_bests( __be16 *bests; free = bp->b_addr; - bests = dp->d_ops->free_bests_p(free); + bests = args->dp->d_ops->free_bests_p(free); ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); - xfs_trans_log_buf(tp, bp, + xfs_trans_log_buf(args->trans, bp, (uint)((char *)&bests[first] - (char *)free), (uint)((char *)&bests[last] - (char *)free + sizeof(bests[0]) - 1)); @@ -264,8 +264,7 @@ xfs_dir2_free_log_bests( */ static void xfs_dir2_free_log_header( - struct xfs_trans *tp, - struct xfs_inode *dp, + struct xfs_da_args *args, struct xfs_buf *bp) { #ifdef DEBUG @@ -275,7 +274,8 @@ xfs_dir2_free_log_header( ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); #endif - xfs_trans_log_buf(tp, bp, 0, dp->d_ops->free_hdr_size - 1); + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->free_hdr_size - 1); } /* @@ -315,20 +315,20 @@ xfs_dir2_leaf_to_node( if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { return error; } - ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp)); + ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); /* * Get the buffer for the new freespace block. */ - error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp); + error = xfs_dir3_free_get_buf(args, fdb, &fbp); if (error) return error; free = fbp->b_addr; dp->d_ops->free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; - ltp = xfs_dir2_leaf_tail_p(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ASSERT(be32_to_cpu(ltp->bestcount) <= - (uint)dp->i_d.di_size / mp->m_dirblksize); + (uint)dp->i_d.di_size / args->geo->blksize); /* * Copy freespace entries from the leaf block to the new block. @@ -349,8 +349,8 @@ xfs_dir2_leaf_to_node( freehdr.nvalid = be32_to_cpu(ltp->bestcount); dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_bests(tp, dp, fbp, 0, freehdr.nvalid - 1); - xfs_dir2_free_log_header(tp, dp, fbp); + xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_log_header(args, fbp); /* * Converting the leaf to a leafnode is just a matter of changing the @@ -364,7 +364,7 @@ xfs_dir2_leaf_to_node( leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); lbp->b_ops = &xfs_dir3_leafn_buf_ops; xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); - xfs_dir3_leaf_log_header(tp, dp, lbp); + xfs_dir3_leaf_log_header(args, lbp); xfs_dir3_leaf_check(dp, lbp); return 0; } @@ -415,7 +415,7 @@ xfs_dir2_leafn_add( * a compact. */ - if (leafhdr.count == dp->d_ops->leaf_max_ents(mp)) { + if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { if (!leafhdr.stale) return XFS_ERROR(ENOSPC); compact = leafhdr.stale > 1; @@ -450,12 +450,12 @@ xfs_dir2_leafn_add( highstale, &lfloglow, &lfloghigh); lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, args->blkno, args->index)); dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, dp, bp); - xfs_dir3_leaf_log_ents(tp, dp, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_log_header(args, bp); + xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); xfs_dir3_leaf_check(dp, bp); return 0; } @@ -471,7 +471,8 @@ xfs_dir2_free_hdr_check( dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); - ASSERT((hdr.firstdb % dp->d_ops->free_max_bests(dp->i_mount)) == 0); + ASSERT((hdr.firstdb % + dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); ASSERT(hdr.firstdb <= db); ASSERT(db < hdr.firstdb + hdr.nvalid); } @@ -576,7 +577,8 @@ xfs_dir2_leafn_lookup_for_addname( /* * Pull the data block number from the entry. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * For addname, we're looking for a place to put the new entry. * We want to use a data block with an entry of equal @@ -593,7 +595,7 @@ xfs_dir2_leafn_lookup_for_addname( * Convert the data block to the free block * holding its freespace information. */ - newfdb = dp->d_ops->db_to_fdb(mp, newdb); + newfdb = dp->d_ops->db_to_fdb(args->geo, newdb); /* * If it's not the one we have in hand, read it in. */ @@ -605,7 +607,8 @@ xfs_dir2_leafn_lookup_for_addname( xfs_trans_brelse(tp, curbp); error = xfs_dir2_free_read(tp, dp, - xfs_dir2_db_to_da(mp, newfdb), + xfs_dir2_db_to_da(args->geo, + newfdb), &curbp); if (error) return error; @@ -616,7 +619,7 @@ xfs_dir2_leafn_lookup_for_addname( /* * Get the index for our entry. */ - fi = dp->d_ops->db_to_fdindex(mp, curdb); + fi = dp->d_ops->db_to_fdindex(args->geo, curdb); /* * If it has room, return it. */ @@ -721,7 +724,8 @@ xfs_dir2_leafn_lookup_for_entry( /* * Pull the data block number from the entry. */ - newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * Not adding a new entry, so we really want to find * the name given to us. @@ -746,7 +750,8 @@ xfs_dir2_leafn_lookup_for_entry( curbp = state->extrablk.bp; } else { error = xfs_dir3_data_read(tp, dp, - xfs_dir2_db_to_da(mp, newdb), + xfs_dir2_db_to_da(args->geo, + newdb), -1, &curbp); if (error) return error; @@ -758,7 +763,8 @@ xfs_dir2_leafn_lookup_for_entry( * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + - xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); /* * Compare the entry and if it's an exact match, return * EEXIST immediately. If it's the first case-insensitive @@ -844,7 +850,6 @@ xfs_dir3_leafn_moveents( int start_d,/* destination leaf index */ int count) /* count of leaves to copy */ { - struct xfs_trans *tp = args->trans; int stale; /* count stale leaves copied */ trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); @@ -863,7 +868,7 @@ xfs_dir3_leafn_moveents( if (start_d < dhdr->count) { memmove(&dents[start_d + count], &dents[start_d], (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, args->dp, bp_d, start_d + count, + xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, count + dhdr->count - 1); } /* @@ -885,8 +890,7 @@ xfs_dir3_leafn_moveents( */ memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, args->dp, bp_d, - start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); /* * If there are source entries after the ones we copied, @@ -895,8 +899,7 @@ xfs_dir3_leafn_moveents( if (start_s + count < shdr->count) { memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir3_leaf_log_ents(tp, args->dp, bp_s, - start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); } /* @@ -1032,8 +1035,8 @@ xfs_dir2_leafn_rebalance( /* log the changes made when moving the entries */ dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); - xfs_dir3_leaf_log_header(args->trans, dp, blk1->bp); - xfs_dir3_leaf_log_header(args->trans, dp, blk2->bp); + xfs_dir3_leaf_log_header(args, blk1->bp); + xfs_dir3_leaf_log_header(args, blk2->bp); xfs_dir3_leaf_check(dp, blk1->bp); xfs_dir3_leaf_check(dp, blk2->bp); @@ -1076,7 +1079,6 @@ xfs_dir3_data_block_free( struct xfs_buf *fbp, int longest) { - struct xfs_trans *tp = args->trans; int logfree = 0; __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; @@ -1090,7 +1092,7 @@ xfs_dir3_data_block_free( * value. */ bests[findex] = cpu_to_be16(longest); - xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); return 0; } @@ -1118,7 +1120,7 @@ xfs_dir3_data_block_free( } dp->d_ops->free_hdr_to_disk(free, &freehdr); - xfs_dir2_free_log_header(tp, dp, fbp); + xfs_dir2_free_log_header(args, fbp); /* * If there are no useful entries left in the block, get rid of the @@ -1142,7 +1144,7 @@ xfs_dir3_data_block_free( /* Log the free entry that changed, unless we got rid of it. */ if (logfree) - xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); return 0; } @@ -1193,9 +1195,9 @@ xfs_dir2_leafn_remove( /* * Extract the data block and offset from the entry. */ - db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); /* @@ -1204,10 +1206,10 @@ xfs_dir2_leafn_remove( */ leafhdr.stale++; dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); - xfs_dir3_leaf_log_header(tp, dp, bp); + xfs_dir3_leaf_log_header(args, bp); lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir3_leaf_log_ents(tp, dp, bp, index, index); + xfs_dir3_leaf_log_ents(args, bp, index, index); /* * Make the data entry free. Keep track of the longest freespace @@ -1219,7 +1221,7 @@ xfs_dir2_leafn_remove( bf = dp->d_ops->data_bestfree_p(hdr); longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; - xfs_dir2_data_make_free(tp, dp, dbp, off, + xfs_dir2_data_make_free(args, dbp, off, dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * Rescan the data block freespaces for bestfree. @@ -1228,7 +1230,7 @@ xfs_dir2_leafn_remove( if (needscan) xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dp, dbp); + xfs_dir2_data_log_header(args, dbp); xfs_dir3_data_check(dp, dbp); /* * If the longest data block freespace changes, need to update @@ -1245,8 +1247,9 @@ xfs_dir2_leafn_remove( * Convert the data block number to a free block, * read in the free block. */ - fdb = dp->d_ops->db_to_fdb(mp, db); - error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), + fdb = dp->d_ops->db_to_fdb(args->geo, db); + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fdb), &fbp); if (error) return error; @@ -1255,20 +1258,21 @@ xfs_dir2_leafn_remove( { struct xfs_dir3_icfree_hdr freehdr; dp->d_ops->free_hdr_from_disk(&freehdr, free); - ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(mp) * - (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * + (fdb - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET))); } #endif /* * Calculate which entry we need to fix. */ - findex = dp->d_ops->db_to_fdindex(mp, db); + findex = dp->d_ops->db_to_fdindex(args->geo, db); longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - + if (longest == args->geo->blksize - dp->d_ops->data_entry_offset) { /* * Try to punch out the data block. @@ -1303,7 +1307,7 @@ xfs_dir2_leafn_remove( */ *rval = (dp->d_ops->leaf_hdr_size + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < - mp->m_dir_magicpct; + args->geo->magicpct; return 0; } @@ -1336,7 +1340,7 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. */ - error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno), + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno), &newblk->bp, XFS_DIR2_LEAFN_MAGIC); if (error) return error; @@ -1410,7 +1414,7 @@ xfs_dir2_leafn_toosmall( count = leafhdr.count - leafhdr.stale; bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); - if (bytes > (state->blocksize >> 1)) { + if (bytes > (state->args->geo->blksize >> 1)) { /* * Blk over 50%, don't try to join. */ @@ -1463,7 +1467,8 @@ xfs_dir2_leafn_toosmall( * Count bytes in the two blocks combined. */ count = leafhdr.count - leafhdr.stale; - bytes = state->blocksize - (state->blocksize >> 2); + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2); leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); @@ -1560,8 +1565,8 @@ xfs_dir2_leafn_unbalance( /* log the changes made when moving the entries */ dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); - xfs_dir3_leaf_log_header(args->trans, dp, save_blk->bp); - xfs_dir3_leaf_log_header(args->trans, dp, drop_blk->bp); + xfs_dir3_leaf_log_header(args, save_blk->bp); + xfs_dir3_leaf_log_header(args, drop_blk->bp); xfs_dir3_leaf_check(dp, save_blk->bp); xfs_dir3_leaf_check(dp, drop_blk->bp); @@ -1587,8 +1592,6 @@ xfs_dir2_node_addname( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Look up the name. We're not supposed to find it, but * this gives us the insertion point. @@ -1727,9 +1730,9 @@ xfs_dir2_node_addname_int( if (dbno == -1) { xfs_fileoff_t fo; /* freespace block number */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) return error; - lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo); + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); fbno = ifbno; } /* @@ -1747,7 +1750,8 @@ xfs_dir2_node_addname_int( * us a freespace block to start with. */ if (++fbno == 0) - fbno = XFS_DIR2_FREE_FIRSTDB(mp); + fbno = xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET); /* * If it's ifbno we already looked at it. */ @@ -1765,8 +1769,8 @@ xfs_dir2_node_addname_int( * to avoid it. */ error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; if (!fbp) @@ -1834,10 +1838,10 @@ xfs_dir2_node_addname_int( * Get the freespace block corresponding to the data block * that was just allocated. */ - fbno = dp->d_ops->db_to_fdb(mp, dbno); + fbno = dp->d_ops->db_to_fdb(args->geo, dbno); error = xfs_dir2_free_try_read(tp, dp, - xfs_dir2_db_to_da(mp, fbno), - &fbp); + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); if (error) return error; @@ -1851,12 +1855,13 @@ xfs_dir2_node_addname_int( if (error) return error; - if (unlikely(dp->d_ops->db_to_fdb(mp, dbno) != fbno)) { + if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { xfs_alert(mp, "%s: dir ino %llu needed freesp block %lld for\n" " data block %lld, got %lld ifbno %llu lastfbno %d", __func__, (unsigned long long)dp->i_ino, - (long long)dp->d_ops->db_to_fdb(mp, dbno), + (long long)dp->d_ops->db_to_fdb( + args->geo, dbno), (long long)dbno, (long long)fbno, (unsigned long long)ifbno, lastfbno); if (fblk) { @@ -1877,7 +1882,7 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp); + error = xfs_dir3_free_get_buf(args, fbno, &fbp); if (error) return error; free = fbp->b_addr; @@ -1887,8 +1892,10 @@ xfs_dir2_node_addname_int( /* * Remember the first slot as our empty slot. */ - freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - dp->d_ops->free_max_bests(mp); + freehdr.firstdb = + (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); } else { free = fbp->b_addr; bests = dp->d_ops->free_bests_p(free); @@ -1898,13 +1905,13 @@ xfs_dir2_node_addname_int( /* * Set the freespace block index from the data block number. */ - findex = dp->d_ops->db_to_fdindex(mp, dbno); + findex = dp->d_ops->db_to_fdindex(args->geo, dbno); /* * If it's after the end of the current entries in the * freespace block, extend that table. */ if (findex >= freehdr.nvalid) { - ASSERT(findex < dp->d_ops->free_max_bests(mp)); + ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); freehdr.nvalid = findex + 1; /* * Tag new entry so nused will go up. @@ -1918,7 +1925,7 @@ xfs_dir2_node_addname_int( if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { freehdr.nused++; dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); - xfs_dir2_free_log_header(tp, dp, fbp); + xfs_dir2_free_log_header(args, fbp); } /* * Update the real value in the table. @@ -1943,7 +1950,8 @@ xfs_dir2_node_addname_int( /* * Read the data block in. */ - error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, dbno), -1, &dbp); if (error) return error; @@ -1961,7 +1969,7 @@ xfs_dir2_node_addname_int( /* * Mark the first part of the unused space, inuse for us. */ - xfs_dir2_data_use_free(tp, dp, dbp, dup, + xfs_dir2_data_use_free(args, dbp, dup, (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* @@ -1974,7 +1982,7 @@ xfs_dir2_node_addname_int( dp->d_ops->data_put_ftype(dep, args->filetype); tagp = dp->d_ops->data_entry_tag_p(dep); *tagp = cpu_to_be16((char *)dep - (char *)hdr); - xfs_dir2_data_log_entry(tp, dp, dbp, dep); + xfs_dir2_data_log_entry(args, dbp, dep); /* * Rescan the block for bestfree if needed. */ @@ -1984,7 +1992,7 @@ xfs_dir2_node_addname_int( * Log the data block header if needed. */ if (needlog) - xfs_dir2_data_log_header(tp, dp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the freespace entry is now wrong, update it. */ @@ -1997,7 +2005,7 @@ xfs_dir2_node_addname_int( * Log the freespace entry if needed. */ if (logfree) - xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex); + xfs_dir2_free_log_bests(args, fbp, findex, findex); /* * Return the data block and offset in args, then drop the data block. */ @@ -2028,8 +2036,6 @@ xfs_dir2_node_lookup( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Fill in the path to the entry in the cursor. */ @@ -2083,8 +2089,6 @@ xfs_dir2_node_removename( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); @@ -2153,8 +2157,6 @@ xfs_dir2_node_replace( state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; inum = args->inumber; /* * Lookup the entry to change in the btree. @@ -2186,15 +2188,15 @@ xfs_dir2_node_replace( hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) ((char *)hdr + - xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address))); + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); args->dp->d_ops->data_put_ftype(dep, args->filetype); - xfs_dir2_data_log_entry(args->trans, args->dp, - state->extrablk.bp, dep); + xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } /* @@ -2262,9 +2264,9 @@ xfs_dir2_node_trim_free( /* * Blow the block away. */ - if ((error = - xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo), - bp))) { + error = xfs_dir2_shrink_inode(args, + xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp); + if (error) { /* * Can't fail with ENOSPC since that only happens with no * space reservation, when breaking up an extent into two diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 8b9d2281f85b..27ce0794d196 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h @@ -20,6 +20,140 @@ struct dir_context; +/* + * Directory offset/block conversion functions. + * + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + /* xfs_dir2.c */ extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -54,8 +188,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); -extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, - xfs_dablk_t bno, xfs_daddr_t mapped_bno); +extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno); extern struct xfs_dir2_data_free * xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, @@ -77,9 +211,9 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, struct xfs_buf **bpp, __uint16_t magic); -extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_inode *dp, +extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, struct xfs_buf *bp, int first, int last); -extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp, struct xfs_inode *dp, +extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, struct xfs_buf *bp); extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index aead369e1c30..48e99afb9cb0 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -76,26 +76,25 @@ const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { STATIC int xfs_dir2_sf_getdents( - xfs_inode_t *dp, /* incore directory inode */ + struct xfs_da_args *args, struct dir_context *ctx) { int i; /* shortform entry number */ - xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; - - mp = dp->i_mount; + struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); /* * Give up if the directory is way too short. */ if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); return XFS_ERROR(EIO); } @@ -109,18 +108,18 @@ xfs_dir2_sf_getdents( /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; /* * Precalculate offsets for . and .. as we will always need them. * * XXX(hch): the second argument is sometimes 0 and sometimes - * mp->m_dirdatablk. + * geo->datablk */ - dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, dp->d_ops->data_dot_offset); - dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, dp->d_ops->data_dotdot_offset); /* @@ -149,7 +148,7 @@ xfs_dir2_sf_getdents( for (i = 0; i < sfp->count; i++) { __uint8_t filetype; - off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, xfs_dir2_sf_get_offset(sfep)); if (ctx->pos > off) { @@ -161,13 +160,13 @@ xfs_dir2_sf_getdents( filetype = dp->d_ops->sf_get_ftype(sfep); ctx->pos = off & 0x7fffffff; if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, - xfs_dir3_get_dtype(mp, filetype))) + xfs_dir3_get_dtype(dp->i_mount, filetype))) return 0; sfep = dp->d_ops->sf_nextentry(sfp, sfep); } - ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; return 0; } @@ -176,9 +175,10 @@ xfs_dir2_sf_getdents( */ STATIC int xfs_dir2_block_getdents( - xfs_inode_t *dp, /* incore inode */ + struct xfs_da_args *args, struct dir_context *ctx) { + struct xfs_inode *dp = args->dp; /* incore directory inode */ xfs_dir2_data_hdr_t *hdr; /* block header */ struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ @@ -186,16 +186,15 @@ xfs_dir2_block_getdents( xfs_dir2_data_unused_t *dup; /* block unused entry */ char *endptr; /* end of the data entries */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ char *ptr; /* current data entry */ int wantoff; /* starting block offset */ xfs_off_t cook; + struct xfs_da_geometry *geo = args->geo; - mp = dp->i_mount; /* * If the block number in the offset is out of range, we're done. */ - if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk) + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) return 0; error = xfs_dir3_block_read(NULL, dp, &bp); @@ -206,13 +205,13 @@ xfs_dir2_block_getdents( * Extract the byte offset we start at from the seek pointer. * We'll skip entries before this. */ - wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos); + wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); /* * Set up values for the loop. */ - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); ptr = (char *)dp->d_ops->data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); @@ -244,7 +243,7 @@ xfs_dir2_block_getdents( if ((char *)dep - (char *)hdr < wantoff) continue; - cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, (char *)dep - (char *)hdr); ctx->pos = cook & 0x7fffffff; @@ -254,7 +253,7 @@ xfs_dir2_block_getdents( */ if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), - xfs_dir3_get_dtype(mp, filetype))) { + xfs_dir3_get_dtype(dp->i_mount, filetype))) { xfs_trans_brelse(NULL, bp); return 0; } @@ -264,8 +263,8 @@ xfs_dir2_block_getdents( * Reached the end of the block. * Set the offset to a non-existent block 1 and return. */ - ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & - 0x7fffffff; + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; xfs_trans_brelse(NULL, bp); return 0; } @@ -286,13 +285,13 @@ struct xfs_dir2_leaf_map_info { STATIC int xfs_dir2_leaf_readbuf( - struct xfs_inode *dp, + struct xfs_da_args *args, size_t bufsize, struct xfs_dir2_leaf_map_info *mip, xfs_dir2_off_t *curoff, struct xfs_buf **bpp) { - struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *dp = args->dp; struct xfs_buf *bp = *bpp; struct xfs_bmbt_irec *map = mip->map; struct blk_plug plug; @@ -300,6 +299,7 @@ xfs_dir2_leaf_readbuf( int length; int i; int j; + struct xfs_da_geometry *geo = args->geo; /* * If we have a buffer, we need to release it and @@ -309,12 +309,12 @@ xfs_dir2_leaf_readbuf( if (bp) { xfs_trans_brelse(NULL, bp); bp = NULL; - mip->map_blocks -= mp->m_dirblkfsbs; + mip->map_blocks -= geo->fsbcount; /* * Loop to get rid of the extents for the * directory block. */ - for (i = mp->m_dirblkfsbs; i > 0; ) { + for (i = geo->fsbcount; i > 0; ) { j = min_t(int, map->br_blockcount, i); map->br_blockcount -= j; map->br_startblock += j; @@ -333,8 +333,7 @@ xfs_dir2_leaf_readbuf( /* * Recalculate the readahead blocks wanted. */ - mip->ra_want = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; + mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; ASSERT(mip->ra_want >= 0); /* @@ -342,14 +341,14 @@ xfs_dir2_leaf_readbuf( * run out of data blocks, get some more mappings. */ if (1 + mip->ra_want > mip->map_blocks && - mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) { + mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { /* * Get more bmaps, fill in after the ones * we already have in the table. */ mip->nmap = mip->map_size - mip->map_valid; error = xfs_bmapi_read(dp, mip->map_off, - xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) - + xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - mip->map_off, &map[mip->map_valid], &mip->nmap, 0); @@ -370,7 +369,7 @@ xfs_dir2_leaf_readbuf( i = mip->map_valid + mip->nmap - 1; mip->map_off = map[i].br_startoff + map[i].br_blockcount; } else - mip->map_off = xfs_dir2_byte_to_da(mp, + mip->map_off = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); /* @@ -396,18 +395,18 @@ xfs_dir2_leaf_readbuf( * No valid mappings, so no more data blocks. */ if (!mip->map_valid) { - *curoff = xfs_dir2_da_to_byte(mp, mip->map_off); + *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); goto out; } /* * Read the directory block starting at the first mapping. */ - mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); + mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); error = xfs_dir3_data_read(NULL, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); - + map->br_blockcount >= geo->fsbcount ? + XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : + -1, &bp); /* * Should just skip over the data block instead of giving up. */ @@ -419,7 +418,7 @@ xfs_dir2_leaf_readbuf( * was previously ra. */ if (mip->ra_current) - mip->ra_current -= mp->m_dirblkfsbs; + mip->ra_current -= geo->fsbcount; /* * Do we need more readahead? @@ -427,16 +426,16 @@ xfs_dir2_leaf_readbuf( blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; mip->ra_want > mip->ra_current && i < mip->map_blocks; - i += mp->m_dirblkfsbs) { + i += geo->fsbcount) { ASSERT(mip->ra_index < mip->map_valid); /* * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { - xfs_dir3_data_readahead(NULL, dp, + map[mip->ra_index].br_blockcount >= geo->fsbcount) { + xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, - XFS_FSB_TO_DADDR(mp, + XFS_FSB_TO_DADDR(dp->i_mount, map[mip->ra_index].br_startblock + mip->ra_offset)); mip->ra_current = i; @@ -447,7 +446,7 @@ xfs_dir2_leaf_readbuf( * use our mapping, but this is a very rare case. */ else if (i > mip->ra_current) { - xfs_dir3_data_readahead(NULL, dp, + xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, -1); mip->ra_current = i; @@ -456,15 +455,14 @@ xfs_dir2_leaf_readbuf( /* * Advance offset through the mapping table. */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { + for (j = 0; j < geo->fsbcount; j += length ) { /* * The rest of this extent but not more than a dir * block. */ - length = min_t(int, mp->m_dirblkfsbs, + length = min_t(int, geo->fsbcount, map[mip->ra_index].br_blockcount - mip->ra_offset); - j += length; mip->ra_offset += length; /* @@ -489,22 +487,23 @@ out: */ STATIC int xfs_dir2_leaf_getdents( - xfs_inode_t *dp, /* incore directory inode */ + struct xfs_da_args *args, struct dir_context *ctx, size_t bufsize) { + struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; /* data block buffer */ xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ int error = 0; /* error return value */ int length; /* temporary length value */ - xfs_mount_t *mp; /* filesystem mount point */ int byteoff; /* offset in current block */ xfs_dir2_off_t curoff; /* current overall offset */ xfs_dir2_off_t newoff; /* new curoff after new blk */ char *ptr = NULL; /* pointer to current data */ struct xfs_dir2_leaf_map_info *map_info; + struct xfs_da_geometry *geo = args->geo; /* * If the offset is at or past the largest allowed value, @@ -513,15 +512,12 @@ xfs_dir2_leaf_getdents( if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) return 0; - mp = dp->i_mount; - /* * Set up to bmap a number of blocks based on the caller's * buffer size, the directory block size, and the filesystem * block size. */ - length = howmany(bufsize + mp->m_dirblksize, - mp->m_sb.sb_blocksize); + length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + (length * sizeof(struct xfs_bmbt_irec)), KM_SLEEP | KM_NOFS); @@ -531,14 +527,14 @@ xfs_dir2_leaf_getdents( * Inside the loop we keep the main offset value as a byte offset * in the directory file. */ - curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos); + curoff = xfs_dir2_dataptr_to_byte(ctx->pos); /* * Force this conversion through db so we truncate the offset * down to get the start of the data block. */ - map_info->map_off = xfs_dir2_db_to_da(mp, - xfs_dir2_byte_to_db(mp, curoff)); + map_info->map_off = xfs_dir2_db_to_da(geo, + xfs_dir2_byte_to_db(geo, curoff)); /* * Loop over directory entries until we reach the end offset. @@ -551,9 +547,9 @@ xfs_dir2_leaf_getdents( * If we have no buffer, or we're off the end of the * current buffer, need to get another one. */ - if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) { + if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { - error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info, + error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, &curoff, &bp); if (error || !map_info->map_valid) break; @@ -561,7 +557,8 @@ xfs_dir2_leaf_getdents( /* * Having done a read, we need to set a new offset. */ - newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0); + newoff = xfs_dir2_db_off_to_byte(geo, + map_info->curdb, 0); /* * Start of the current block. */ @@ -571,7 +568,7 @@ xfs_dir2_leaf_getdents( * Make sure we're in the right block. */ else if (curoff > newoff) - ASSERT(xfs_dir2_byte_to_db(mp, curoff) == + ASSERT(xfs_dir2_byte_to_db(geo, curoff) == map_info->curdb); hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); @@ -579,7 +576,7 @@ xfs_dir2_leaf_getdents( * Find our position in the block. */ ptr = (char *)dp->d_ops->data_entry_p(hdr); - byteoff = xfs_dir2_byte_to_off(mp, curoff); + byteoff = xfs_dir2_byte_to_off(geo, curoff); /* * Skip past the header. */ @@ -608,10 +605,10 @@ xfs_dir2_leaf_getdents( * Now set our real offset. */ curoff = - xfs_dir2_db_off_to_byte(mp, - xfs_dir2_byte_to_db(mp, curoff), + xfs_dir2_db_off_to_byte(geo, + xfs_dir2_byte_to_db(geo, curoff), (char *)ptr - (char *)hdr); - if (ptr >= (char *)hdr + mp->m_dirblksize) { + if (ptr >= (char *)hdr + geo->blksize) { continue; } } @@ -635,10 +632,10 @@ xfs_dir2_leaf_getdents( length = dp->d_ops->data_entsize(dep->namelen); filetype = dp->d_ops->data_get_ftype(dep); - ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), - xfs_dir3_get_dtype(mp, filetype))) + xfs_dir3_get_dtype(dp->i_mount, filetype))) break; /* @@ -653,10 +650,10 @@ xfs_dir2_leaf_getdents( /* * All done. Set output offset value to current offset. */ - if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) + if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; else - ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; kmem_free(map_info); if (bp) xfs_trans_brelse(NULL, bp); @@ -668,13 +665,14 @@ xfs_dir2_leaf_getdents( */ int xfs_readdir( - xfs_inode_t *dp, - struct dir_context *ctx, - size_t bufsize) + struct xfs_inode *dp, + struct dir_context *ctx, + size_t bufsize) { - int rval; /* return value */ - int v; /* type-checking value */ - uint lock_mode; + struct xfs_da_args args = { NULL }; + int rval; + int v; + uint lock_mode; trace_xfs_readdir(dp); @@ -684,15 +682,18 @@ xfs_readdir( ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); + args.dp = dp; + args.geo = dp->i_mount->m_dir_geo; + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, ctx); - else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) + rval = xfs_dir2_sf_getdents(&args, ctx); + else if ((rval = xfs_dir2_isblock(&args, &v))) ; else if (v) - rval = xfs_dir2_block_getdents(dp, ctx); + rval = xfs_dir2_block_getdents(&args, ctx); else - rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); + rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); xfs_iunlock(dp, lock_mode); return rval; diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 3725fb1b902b..53c3be619db5 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -82,8 +82,10 @@ xfs_dir2_block_sfsize( xfs_ino_t parent = 0; /* parent inode number */ int size=0; /* total computed size */ int has_ftype; + struct xfs_da_geometry *geo; mp = dp->i_mount; + geo = mp->m_dir_geo; /* * if there is a filetype field, add the extra byte to the namelen @@ -92,7 +94,7 @@ xfs_dir2_block_sfsize( has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; count = i8count = namelen = 0; - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(geo, hdr); blp = xfs_dir2_block_leaf_p(btp); /* @@ -104,8 +106,8 @@ xfs_dir2_block_sfsize( /* * Calculate the pointer to the entry at hand. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(geo, addr)); /* * Detect . and .., so we can special-case them. * . is not included in sf directories. @@ -195,7 +197,7 @@ xfs_dir2_block_to_sf( /* * Set up to loop over the block's entries. */ - btp = xfs_dir2_block_tail_p(mp, hdr); + btp = xfs_dir2_block_tail_p(args->geo, hdr); ptr = (char *)dp->d_ops->data_entry_p(hdr); endptr = (char *)xfs_dir2_block_leaf_p(btp); sfep = xfs_dir2_sf_firstentry(sfp); @@ -247,7 +249,7 @@ xfs_dir2_block_to_sf( /* now we are done with the block, we can shrink the inode */ logflags = XFS_ILOG_CORE; - error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp); + error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); if (error) { ASSERT(error != ENOSPC); goto out; @@ -285,14 +287,12 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - int add_entsize; /* size of the new entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* di_size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ - int old_isize; /* di_size before adding name */ int pick; /* which algorithm to use */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ @@ -316,8 +316,7 @@ xfs_dir2_sf_addname( /* * Compute entry (and change in) size. */ - add_entsize = dp->d_ops->sf_entsize(sfp, args->namelen); - incr_isize = add_entsize; + incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); objchange = 0; #if XFS_BIG_INUMS /* @@ -325,11 +324,8 @@ xfs_dir2_sf_addname( */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { /* - * Yes, adjust the entry size and the total size. + * Yes, adjust the inode size. old count + (parent + new) */ - add_entsize += - (uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t); incr_isize += (sfp->count + 2) * ((uint)sizeof(xfs_dir2_ino8_t) - @@ -337,8 +333,7 @@ xfs_dir2_sf_addname( objchange = 1; } #endif - old_isize = (int)dp->i_d.di_size; - new_isize = old_isize + incr_isize; + new_isize = (int)dp->i_d.di_size + incr_isize; /* * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). @@ -593,7 +588,7 @@ xfs_dir2_sf_addname_pick( * we'll go back, convert to block, then try the insert and convert * to leaf. */ - if (used + (holefit ? 0 : size) > mp->m_dirblksize) + if (used + (holefit ? 0 : size) > args->geo->blksize) return 0; /* * If changing the inode number size, do it the hard way. @@ -608,7 +603,7 @@ xfs_dir2_sf_addname_pick( /* * If it won't fit at the end then do it the hard way (use the hole). */ - if (used + size > mp->m_dirblksize) + if (used + size > args->geo->blksize) return 2; /* * Do it the easy way. @@ -659,7 +654,7 @@ xfs_dir2_sf_check( ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); ASSERT(offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + - (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize); + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); } #endif /* DEBUG */ @@ -1110,9 +1105,9 @@ xfs_dir2_sf_toino4( } /* - * Convert from 4-byte inode numbers to 8-byte inode numbers. - * The new 8-byte inode number is not there yet, we leave with the - * count 1 but no corresponding entry. + * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers. + * The new entry w/ an 8-byte inode number is not there yet; we leave with + * i8count set to 1, but no corresponding 8-byte entry. */ static void xfs_dir2_sf_toino8( @@ -1145,7 +1140,7 @@ xfs_dir2_sf_toino8( ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* - * Compute the new inode size. + * Compute the new inode size (nb: entry count + 1 for parent) */ newsize = oldsize + diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 868b19f096bf..3ee0cd43edc0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -353,10 +353,10 @@ xfs_qm_dqalloc( dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); - - error = xfs_buf_geterror(bp); - if (error) + if (!bp) { + error = ENOMEM; goto error1; + } bp->b_ops = &xfs_dquot_buf_ops; /* @@ -832,47 +832,6 @@ restart: return (0); } - -STATIC void -xfs_qm_dqput_final( - struct xfs_dquot *dqp) -{ - struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - struct xfs_dquot *gdqp; - struct xfs_dquot *pdqp; - - trace_xfs_dqput_free(dqp); - - if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) - XFS_STATS_INC(xs_qm_dquot_unused); - - /* - * If we just added a udquot to the freelist, then we want to release - * the gdquot/pdquot reference that it (probably) has. Otherwise it'll - * keep the gdquot/pdquot from getting reclaimed. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - pdqp = dqp->q_pdquot; - if (pdqp) { - xfs_dqlock(pdqp); - dqp->q_pdquot = NULL; - } - xfs_dqunlock(dqp); - - /* - * If we had a group/project quota hint, release it now. - */ - if (gdqp) - xfs_qm_dqput(gdqp); - if (pdqp) - xfs_qm_dqput(pdqp); -} - /* * Release a reference to the dquot (decrement ref-count) and unlock it. * @@ -888,10 +847,14 @@ xfs_qm_dqput( trace_xfs_dqput(dqp); - if (--dqp->q_nrefs > 0) - xfs_dqunlock(dqp); - else - xfs_qm_dqput_final(dqp); + if (--dqp->q_nrefs == 0) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + trace_xfs_dqput_free(dqp); + + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + XFS_STATS_INC(xs_qm_dquot_unused); + } + xfs_dqunlock(dqp); } /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index d22ed0053c32..68a68f704837 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -52,8 +52,6 @@ typedef struct xfs_dquot { int q_bufoffset; /* off of dq in buffer (# dquots) */ xfs_fileoff_t q_fileoffset; /* offset in quotas file */ - struct xfs_dquot*q_gdquot; /* group dquot, hint only */ - struct xfs_dquot*q_pdquot; /* project dquot, hint only */ xfs_disk_dquot_t q_core; /* actual usage & quotas */ xfs_dq_logitem_t q_logitem; /* dquot log item */ xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c index 610da8177737..c2ac0c611ad8 100644 --- a/fs/xfs/xfs_dquot_buf.c +++ b/fs/xfs/xfs_dquot_buf.c @@ -35,7 +35,6 @@ int xfs_calc_dquots_per_chunk( - struct xfs_mount *mp, unsigned int nbblks) /* basic block units */ { unsigned int ndquots; @@ -194,7 +193,7 @@ xfs_dquot_buf_verify_crc( if (mp->m_quotainfo) ndquots = mp->m_quotainfo->qi_dqperchunk; else - ndquots = xfs_calc_dquots_per_chunk(mp, + ndquots = xfs_calc_dquots_per_chunk( XFS_BB_TO_FSB(mp, bp->b_length)); for (i = 0; i < ndquots; i++, d++) { @@ -225,7 +224,7 @@ xfs_dquot_buf_verify( if (mp->m_quotainfo) ndquots = mp->m_quotainfo->qi_dqperchunk; else - ndquots = xfs_calc_dquots_per_chunk(mp, bp->b_length); + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); /* * On the first read of the buffer, verify that each dquot is valid. diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1399e187d425..753e467aa1a5 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 79e96ce98733..1f66779d7a46 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -155,7 +155,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -229,34 +229,27 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_aio_read( +xfs_file_read_iter( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - size_t size = 0; + size_t size = iov_iter_count(to); ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; + loff_t pos = iocb->ki_pos; XFS_STATS_INC(xs_read_calls); - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; - ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE); - if (ret < 0) - return ret; - if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? @@ -295,7 +288,7 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( + ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, -1); if (ret) { @@ -309,7 +302,7 @@ xfs_file_aio_read( trace_xfs_file_read(ip, size, pos, ioflags); - ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); + ret = generic_file_read_iter(iocb, to); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -350,47 +343,6 @@ xfs_file_splice_read( } /* - * xfs_file_splice_write() does not use xfs_rw_ilock() because - * generic_file_splice_write() takes the i_mutex itself. This, in theory, - * couuld cause lock inversions between the aio_write path and the splice path - * if someone is doing concurrent splice(2) based writes and write(2) based - * writes to the same inode. The only real way to fix this is to re-implement - * the generic code here with correct locking orders. - */ -STATIC ssize_t -xfs_file_splice_write( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - unsigned int flags) -{ - struct inode *inode = outfilp->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - int ioflags = 0; - ssize_t ret; - - XFS_STATS_INC(xs_write_calls); - - if (outfilp->f_mode & FMODE_NOCMTIME) - ioflags |= IO_INVIS; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - trace_xfs_file_splice_write(ip, count, *ppos, ioflags); - - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - -/* * This routine is called to handle zeroing any space in the last block of the * file that is beyond the EOF. We do this since the size is being increased * without writing anything to that block and we don't want to read the @@ -625,10 +577,7 @@ restart: STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t ocount) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -636,9 +585,10 @@ xfs_file_dio_aio_write( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; - size_t count = ocount; int unaligned_io = 0; int iolock; + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -677,9 +627,10 @@ xfs_file_dio_aio_write( ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); if (ret) goto out; + iov_iter_truncate(from, count); if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -698,8 +649,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, count, ocount); + ret = generic_file_direct_write(iocb, from, pos); out: xfs_rw_iunlock(ip, iolock); @@ -712,10 +662,7 @@ out: STATIC ssize_t xfs_file_buffered_aio_write( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos, - size_t count) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -724,7 +671,8 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - struct iov_iter from; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); xfs_rw_ilock(ip, iolock); @@ -732,13 +680,13 @@ xfs_file_buffered_aio_write( if (ret) goto out; - iov_iter_init(&from, iovp, nr_segs, count, 0); + iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_perform_write(file, &from, pos); + ret = generic_perform_write(file, from, pos); if (likely(ret >= 0)) iocb->ki_pos = pos + ret; /* @@ -759,40 +707,29 @@ out: } STATIC ssize_t -xfs_file_aio_write( +xfs_file_write_iter( struct kiocb *iocb, - const struct iovec *iovp, - unsigned long nr_segs, - loff_t pos) + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; - size_t ocount = 0; + size_t ocount = iov_iter_count(from); XFS_STATS_INC(xs_write_calls); - BUG_ON(iocb->ki_pos != pos); - - ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); - if (ret) - return ret; - if (ocount == 0) return 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ret = -EIO; - goto out; - } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; if (unlikely(file->f_flags & O_DIRECT)) - ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); + ret = xfs_file_dio_aio_write(iocb, from); else - ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, - ocount); + ret = xfs_file_buffered_aio_write(iocb, from); if (ret > 0) { ssize_t err; @@ -804,8 +741,6 @@ xfs_file_aio_write( if (err < 0) ret = err; } - -out: return ret; } @@ -837,11 +772,19 @@ xfs_file_fallocate( unsigned blksize_mask = (1 << inode->i_blkbits) - 1; if (offset & blksize_mask || len & blksize_mask) { - error = -EINVAL; + error = EINVAL; + goto out_unlock; + } + + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = EINVAL; goto out_unlock; } - ASSERT(offset + len < i_size_read(inode)); new_size = i_size_read(inode) - len; error = xfs_collapse_file_space(ip, offset, len); @@ -936,7 +879,7 @@ xfs_dir_open( */ mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) - xfs_dir3_data_readahead(NULL, ip, 0, -1); + xfs_dir3_data_readahead(ip, 0, -1); xfs_iunlock(ip, mode); return 0; } @@ -1453,12 +1396,12 @@ xfs_file_llseek( const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = xfs_file_aio_read, - .aio_write = xfs_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = xfs_file_read_iter, + .write_iter = xfs_file_write_iter, .splice_read = xfs_file_splice_read, - .splice_write = xfs_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 12b6e7701985..8ec81bed7992 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * Copyright (c) 2014 Christoph Hellwig. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -32,100 +33,20 @@ #include "xfs_filestream.h" #include "xfs_trace.h" -#ifdef XFS_FILESTREAMS_TRACE - -ktrace_t *xfs_filestreams_trace_buf; - -STATIC void -xfs_filestreams_trace( - xfs_mount_t *mp, /* mount point */ - int type, /* type of trace */ - const char *func, /* source function */ - int line, /* source line number */ - __psunsigned_t arg0, - __psunsigned_t arg1, - __psunsigned_t arg2, - __psunsigned_t arg3, - __psunsigned_t arg4, - __psunsigned_t arg5) -{ - ktrace_enter(xfs_filestreams_trace_buf, - (void *)(__psint_t)(type | (line << 16)), - (void *)func, - (void *)(__psunsigned_t)current_pid(), - (void *)mp, - (void *)(__psunsigned_t)arg0, - (void *)(__psunsigned_t)arg1, - (void *)(__psunsigned_t)arg2, - (void *)(__psunsigned_t)arg3, - (void *)(__psunsigned_t)arg4, - (void *)(__psunsigned_t)arg5, - NULL, NULL, NULL, NULL, NULL, NULL); -} - -#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0) -#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0) -#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0) -#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0) -#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) -#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) -#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ - xfs_filestreams_trace(mp, t, __func__, __LINE__, \ - (__psunsigned_t)a0, (__psunsigned_t)a1, \ - (__psunsigned_t)a2, (__psunsigned_t)a3, \ - (__psunsigned_t)a4, (__psunsigned_t)a5) - -#define TRACE_AG_SCAN(mp, ag, ag2) \ - TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2); -#define TRACE_AG_PICK1(mp, max_ag, maxfree) \ - TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree); -#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \ - TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \ - cnt, free, scan, flag) -#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \ - TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2) -#define TRACE_FREE(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt) -#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt) -#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \ - TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt) -#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \ - TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt) -#define TRACE_ORPHAN(mp, ip, ag) \ - TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag); - - -#else -#define TRACE_AG_SCAN(mp, ag, ag2) -#define TRACE_AG_PICK1(mp, max_ag, maxfree) -#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) -#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) -#define TRACE_FREE(mp, ip, pip, ag, cnt) -#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) -#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) -#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) -#define TRACE_ORPHAN(mp, ip, ag) -#endif - -static kmem_zone_t *item_zone; +struct xfs_fstrm_item { + struct xfs_mru_cache_elem mru; + struct xfs_inode *ip; + xfs_agnumber_t ag; /* AG in use for this directory */ +}; -/* - * Structure for associating a file or a directory with an allocation group. - * The parent directory pointer is only needed for files, but since there will - * generally be vastly more files than directories in the cache, using the same - * data structure simplifies the code with very little memory overhead. - */ -typedef struct fstrm_item -{ - xfs_agnumber_t ag; /* AG currently in use for the file/directory. */ - xfs_inode_t *ip; /* inode self-pointer. */ - xfs_inode_t *pip; /* Parent directory inode pointer. */ -} fstrm_item_t; +enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +}; /* * Allocation group filestream associations are tracked with per-ag atomic - * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a + * counters. These counters allow xfs_filestream_pick_ag() to tell whether a * particular AG already has active filestreams associated with it. The mount * point's m_peraglock is used to protect these counters from per-ag array * re-allocation during a growfs operation. When xfs_growfs_data_private() is @@ -160,7 +81,7 @@ typedef struct fstrm_item * the cache that reference per-ag array elements that have since been * reallocated. */ -static int +int xfs_filestream_peek_ag( xfs_mount_t *mp, xfs_agnumber_t agno) @@ -200,23 +121,40 @@ xfs_filestream_put_ag( xfs_perag_put(pag); } +static void +xfs_fstrm_free_func( + struct xfs_mru_cache_elem *mru) +{ + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + + xfs_filestream_put_ag(item->ip->i_mount, item->ag); + + trace_xfs_filestream_free(item->ip, item->ag); + + kmem_free(item); +} + /* * Scan the AGs starting at startag looking for an AG that isn't in use and has * at least minlen blocks free. */ static int -_xfs_filestream_pick_ag( - xfs_mount_t *mp, - xfs_agnumber_t startag, - xfs_agnumber_t *agp, - int flags, - xfs_extlen_t minlen) +xfs_filestream_pick_ag( + struct xfs_inode *ip, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) { - int streams, max_streams; - int err, trylock, nscan; - xfs_extlen_t longest, free, minfree, maxfree = 0; - xfs_agnumber_t ag, max_ag = NULLAGNUMBER; - struct xfs_perag *pag; + struct xfs_mount *mp = ip->i_mount; + struct xfs_fstrm_item *item; + struct xfs_perag *pag; + xfs_extlen_t longest, free = 0, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + int err, trylock, nscan; + + ASSERT(S_ISDIR(ip->i_d.di_mode)); /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; @@ -228,8 +166,9 @@ _xfs_filestream_pick_ag( trylock = XFS_ALLOC_FLAG_TRYLOCK; for (nscan = 0; 1; nscan++) { + trace_xfs_filestream_scan(ip, ag); + pag = xfs_perag_get(mp, ag); - TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms)); if (!pag->pagf_init) { err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); @@ -246,7 +185,6 @@ _xfs_filestream_pick_ag( /* Keep track of the AG with the most free blocks. */ if (pag->pagf_freeblks > maxfree) { maxfree = pag->pagf_freeblks; - max_streams = atomic_read(&pag->pagf_fstrms); max_ag = ag; } @@ -269,7 +207,6 @@ _xfs_filestream_pick_ag( /* Break out, retaining the reference on the AG. */ free = pag->pagf_freeblks; - streams = atomic_read(&pag->pagf_fstrms); xfs_perag_put(pag); *agp = ag; break; @@ -305,317 +242,98 @@ next_ag: */ if (max_ag != NULLAGNUMBER) { xfs_filestream_get_ag(mp, max_ag); - TRACE_AG_PICK1(mp, max_ag, maxfree); - streams = max_streams; free = maxfree; *agp = max_ag; break; } /* take AG 0 if none matched */ - TRACE_AG_PICK1(mp, max_ag, maxfree); + trace_xfs_filestream_pick(ip, *agp, free, nscan); *agp = 0; return 0; } - TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags); - - return 0; -} + trace_xfs_filestream_pick(ip, *agp, free, nscan); -/* - * Set the allocation group number for a file or a directory, updating inode - * references and per-AG references as appropriate. - */ -static int -_xfs_filestream_update_ag( - xfs_inode_t *ip, - xfs_inode_t *pip, - xfs_agnumber_t ag) -{ - int err = 0; - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t old_ag; - xfs_inode_t *old_pip; - - /* - * Either ip is a regular file and pip is a directory, or ip is a - * directory and pip is NULL. - */ - ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip && - S_ISDIR(pip->i_d.di_mode)) || - (S_ISDIR(ip->i_d.di_mode) && !pip))); - - mp = ip->i_mount; - cache = mp->m_filestream; - - item = xfs_mru_cache_lookup(cache, ip->i_ino); - if (item) { - ASSERT(item->ip == ip); - old_ag = item->ag; - item->ag = ag; - old_pip = item->pip; - item->pip = pip; - xfs_mru_cache_done(cache); - - /* - * If the AG has changed, drop the old ref and take a new one, - * effectively transferring the reference from old to new AG. - */ - if (ag != old_ag) { - xfs_filestream_put_ag(mp, old_ag); - xfs_filestream_get_ag(mp, ag); - } - - /* - * If ip is a file and its pip has changed, drop the old ref and - * take a new one. - */ - if (pip && pip != old_pip) { - IRELE(old_pip); - IHOLD(pip); - } - - TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag), - ag, xfs_filestream_peek_ag(mp, ag)); + if (*agp == NULLAGNUMBER) return 0; - } - item = kmem_zone_zalloc(item_zone, KM_MAYFAIL); + err = ENOMEM; + item = kmem_alloc(sizeof(*item), KM_MAYFAIL); if (!item) - return ENOMEM; + goto out_put_ag; - item->ag = ag; + item->ag = *agp; item->ip = ip; - item->pip = pip; - err = xfs_mru_cache_insert(cache, ip->i_ino, item); + err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); if (err) { - kmem_zone_free(item_zone, item); - return err; + if (err == EEXIST) + err = 0; + goto out_free_item; } - /* Take a reference on the AG. */ - xfs_filestream_get_ag(mp, ag); - - /* - * Take a reference on the inode itself regardless of whether it's a - * regular file or a directory. - */ - IHOLD(ip); - - /* - * In the case of a regular file, take a reference on the parent inode - * as well to ensure it remains in-core. - */ - if (pip) - IHOLD(pip); - - TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag), - ag, xfs_filestream_peek_ag(mp, ag)); - return 0; -} - -/* xfs_fstrm_free_func(): callback for freeing cached stream items. */ -STATIC void -xfs_fstrm_free_func( - unsigned long ino, - void *data) -{ - fstrm_item_t *item = (fstrm_item_t *)data; - xfs_inode_t *ip = item->ip; - - ASSERT(ip->i_ino == ino); - - xfs_iflags_clear(ip, XFS_IFILESTREAM); - - /* Drop the reference taken on the AG when the item was added. */ - xfs_filestream_put_ag(ip->i_mount, item->ag); - - TRACE_FREE(ip->i_mount, ip, item->pip, item->ag, - xfs_filestream_peek_ag(ip->i_mount, item->ag)); - - /* - * _xfs_filestream_update_ag() always takes a reference on the inode - * itself, whether it's a file or a directory. Release it here. - * This can result in the inode being freed and so we must - * not hold any inode locks when freeing filesstreams objects - * otherwise we can deadlock here. - */ - IRELE(ip); - - /* - * In the case of a regular file, _xfs_filestream_update_ag() also - * takes a ref on the parent inode to keep it in-core. Release that - * too. - */ - if (item->pip) - IRELE(item->pip); - - /* Finally, free the memory allocated for the item. */ - kmem_zone_free(item_zone, item); -} - -/* - * xfs_filestream_init() is called at xfs initialisation time to set up the - * memory zone that will be used for filestream data structure allocation. - */ -int -xfs_filestream_init(void) -{ - item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); - if (!item_zone) - return -ENOMEM; - - return 0; -} - -/* - * xfs_filestream_uninit() is called at xfs termination time to destroy the - * memory zone that was used for filestream data structure allocation. - */ -void -xfs_filestream_uninit(void) -{ - kmem_zone_destroy(item_zone); -} - -/* - * xfs_filestream_mount() is called when a file system is mounted with the - * filestream option. It is responsible for allocating the data structures - * needed to track the new file system's file streams. - */ -int -xfs_filestream_mount( - xfs_mount_t *mp) -{ - int err; - unsigned int lifetime, grp_count; - - /* - * The filestream timer tunable is currently fixed within the range of - * one second to four minutes, with five seconds being the default. The - * group count is somewhat arbitrary, but it'd be nice to adhere to the - * timer tunable to within about 10 percent. This requires at least 10 - * groups. - */ - lifetime = xfs_fstrm_centisecs * 10; - grp_count = 10; - - err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count, - xfs_fstrm_free_func); +out_free_item: + kmem_free(item); +out_put_ag: + xfs_filestream_put_ag(mp, *agp); return err; } -/* - * xfs_filestream_unmount() is called when a file system that was mounted with - * the filestream option is unmounted. It drains the data structures created - * to track the file system's file streams and frees all the memory that was - * allocated. - */ -void -xfs_filestream_unmount( - xfs_mount_t *mp) +static struct xfs_inode * +xfs_filestream_get_parent( + struct xfs_inode *ip) { - xfs_mru_cache_destroy(mp->m_filestream); -} + struct inode *inode = VFS_I(ip), *dir = NULL; + struct dentry *dentry, *parent; -/* - * Return the AG of the filestream the file or directory belongs to, or - * NULLAGNUMBER otherwise. - */ -xfs_agnumber_t -xfs_filestream_lookup_ag( - xfs_inode_t *ip) -{ - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t ag; - int ref; - - if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) { - ASSERT(0); - return NULLAGNUMBER; - } + dentry = d_find_alias(inode); + if (!dentry) + goto out; - cache = ip->i_mount->m_filestream; - item = xfs_mru_cache_lookup(cache, ip->i_ino); - if (!item) { - TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0); - return NULLAGNUMBER; - } + parent = dget_parent(dentry); + if (!parent) + goto out_dput; - ASSERT(ip == item->ip); - ag = item->ag; - ref = xfs_filestream_peek_ag(ip->i_mount, ag); - xfs_mru_cache_done(cache); + dir = igrab(parent->d_inode); + dput(parent); - TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref); - return ag; +out_dput: + dput(dentry); +out: + return dir ? XFS_I(dir) : NULL; } /* - * xfs_filestream_associate() should only be called to associate a regular file - * with its parent directory. Calling it with a child directory isn't - * appropriate because filestreams don't apply to entire directory hierarchies. - * Creating a file in a child directory of an existing filestream directory - * starts a new filestream with its own allocation group association. + * Find the right allocation group for a file, either by finding an + * existing file stream or creating a new one. * - * Returns < 0 on error, 0 if successful association occurred, > 0 if - * we failed to get an association because of locking issues. + * Returns NULLAGNUMBER in case of an error. */ -int -xfs_filestream_associate( - xfs_inode_t *pip, - xfs_inode_t *ip) +xfs_agnumber_t +xfs_filestream_lookup_ag( + struct xfs_inode *ip) { - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - fstrm_item_t *item; - xfs_agnumber_t ag, rotorstep, startag; - int err = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_inode *pip = NULL; + xfs_agnumber_t startag, ag = NULLAGNUMBER; + struct xfs_mru_cache_elem *mru; - ASSERT(S_ISDIR(pip->i_d.di_mode)); ASSERT(S_ISREG(ip->i_d.di_mode)); - if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode)) - return -EINVAL; - mp = pip->i_mount; - cache = mp->m_filestream; + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto out; - /* - * We have a problem, Houston. - * - * Taking the iolock here violates inode locking order - we already - * hold the ilock. Hence if we block getting this lock we may never - * wake. Unfortunately, that means if we can't get the lock, we're - * screwed in terms of getting a stream association - we can't spin - * waiting for the lock because someone else is waiting on the lock we - * hold and we cannot drop that as we are in a transaction here. - * - * Lucky for us, this inversion is not a problem because it's a - * directory inode that we are trying to lock here. - * - * So, if we can't get the iolock without sleeping then just give up - */ - if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) - return 1; - - /* If the parent directory is already in the cache, use its AG. */ - item = xfs_mru_cache_lookup(cache, pip->i_ino); - if (item) { - ASSERT(item->ip == pip); - ag = item->ag; - xfs_mru_cache_done(cache); - - TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag)); - err = _xfs_filestream_update_ag(ip, pip, ag); + mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); + if (mru) { + ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; + xfs_mru_cache_done(mp->m_filestream); - goto exit; + trace_xfs_filestream_lookup(ip, ag); + goto out; } /* @@ -623,202 +341,94 @@ xfs_filestream_associate( * use the directory inode's AG. */ if (mp->m_flags & XFS_MOUNT_32BITINODES) { - rotorstep = xfs_rotorstep; + xfs_agnumber_t rotorstep = xfs_rotorstep; startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; mp->m_agfrotor = (mp->m_agfrotor + 1) % (mp->m_sb.sb_agcount * rotorstep); } else startag = XFS_INO_TO_AGNO(mp, pip->i_ino); - /* Pick a new AG for the parent inode starting at startag. */ - err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0); - if (err || ag == NULLAGNUMBER) - goto exit_did_pick; - - /* Associate the parent inode with the AG. */ - err = _xfs_filestream_update_ag(pip, NULL, ag); - if (err) - goto exit_did_pick; - - /* Associate the file inode with the AG. */ - err = _xfs_filestream_update_ag(ip, pip, ag); - if (err) - goto exit_did_pick; - - TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag)); - -exit_did_pick: - /* - * If _xfs_filestream_pick_ag() returned a valid AG, remove the - * reference it took on it, since the file and directory will have taken - * their own now if they were successfully cached. - */ - if (ag != NULLAGNUMBER) - xfs_filestream_put_ag(mp, ag); - -exit: - xfs_iunlock(pip, XFS_IOLOCK_EXCL); - return -err; + if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) + ag = NULLAGNUMBER; +out: + IRELE(pip); + return ag; } /* - * Pick a new allocation group for the current file and its file stream. This - * function is called by xfs_bmap_filestreams() with the mount point's per-ag - * lock held. + * Pick a new allocation group for the current file and its file stream. + * + * This is called when the allocator can't find a suitable extent in the + * current AG, and we have to move the stream into a new AG with more space. */ int xfs_filestream_new_ag( struct xfs_bmalloca *ap, xfs_agnumber_t *agp) { - int flags, err; - xfs_inode_t *ip, *pip = NULL; - xfs_mount_t *mp; - xfs_mru_cache_t *cache; - xfs_extlen_t minlen; - fstrm_item_t *dir, *file; - xfs_agnumber_t ag = NULLAGNUMBER; - - ip = ap->ip; - mp = ip->i_mount; - cache = mp->m_filestream; - minlen = ap->length; - *agp = NULLAGNUMBER; + struct xfs_inode *ip = ap->ip, *pip; + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t minlen = ap->length; + xfs_agnumber_t startag = 0; + int flags, err = 0; + struct xfs_mru_cache_elem *mru; - /* - * Look for the file in the cache, removing it if it's found. Doing - * this allows it to be held across the dir lookup that follows. - */ - file = xfs_mru_cache_remove(cache, ip->i_ino); - if (file) { - ASSERT(ip == file->ip); - - /* Save the file's parent inode and old AG number for later. */ - pip = file->pip; - ag = file->ag; - - /* Look for the file's directory in the cache. */ - dir = xfs_mru_cache_lookup(cache, pip->i_ino); - if (dir) { - ASSERT(pip == dir->ip); - - /* - * If the directory has already moved on to a new AG, - * use that AG as the new AG for the file. Don't - * forget to twiddle the AG refcounts to match the - * movement. - */ - if (dir->ag != file->ag) { - xfs_filestream_put_ag(mp, file->ag); - xfs_filestream_get_ag(mp, dir->ag); - *agp = file->ag = dir->ag; - } - - xfs_mru_cache_done(cache); - } + *agp = NULLAGNUMBER; - /* - * Put the file back in the cache. If this fails, the free - * function needs to be called to tidy up in the same way as if - * the item had simply expired from the cache. - */ - err = xfs_mru_cache_insert(cache, ip->i_ino, file); - if (err) { - xfs_fstrm_free_func(ip->i_ino, file); - return err; - } + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto exit; - /* - * If the file's AG was moved to the directory's new AG, there's - * nothing more to be done. - */ - if (*agp != NULLAGNUMBER) { - TRACE_MOVEAG(mp, ip, pip, - ag, xfs_filestream_peek_ag(mp, ag), - *agp, xfs_filestream_peek_ag(mp, *agp)); - return 0; - } + mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); + if (mru) { + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - /* - * If the file's parent directory is known, take its iolock in exclusive - * mode to prevent two sibling files from racing each other to migrate - * themselves and their parent to different AGs. - * - * Note that we lock the parent directory iolock inside the child - * iolock here. That's fine as we never hold both parent and child - * iolock in any other place. This is different from the ilock, - * which requires locking of the child after the parent for namespace - * operations. - */ - if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - - /* - * A new AG needs to be found for the file. If the file's parent - * directory is also known, it will be moved to the new AG as well to - * ensure that files created inside it in future use the new AG. - */ - ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount; flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); - err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen); - if (err || *agp == NULLAGNUMBER) - goto exit; + err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); /* - * If the file wasn't found in the file cache, then its parent directory - * inode isn't known. For this to have happened, the file must either - * be pre-existing, or it was created long enough ago that its cache - * entry has expired. This isn't the sort of usage that the filestreams - * allocator is trying to optimise, so there's no point trying to track - * its new AG somehow in the filestream data structures. + * Only free the item here so we skip over the old AG earlier. */ - if (!pip) { - TRACE_ORPHAN(mp, ip, *agp); - goto exit; - } - - /* Associate the parent inode with the AG. */ - err = _xfs_filestream_update_ag(pip, NULL, *agp); - if (err) - goto exit; - - /* Associate the file inode with the AG. */ - err = _xfs_filestream_update_ag(ip, pip, *agp); - if (err) - goto exit; - - TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0, - *agp, xfs_filestream_peek_ag(mp, *agp)); + if (mru) + xfs_fstrm_free_func(mru); + IRELE(pip); exit: - /* - * If _xfs_filestream_pick_ag() returned a valid AG, remove the - * reference it took on it, since the file and directory will have taken - * their own now if they were successfully cached. - */ - if (*agp != NULLAGNUMBER) - xfs_filestream_put_ag(mp, *agp); - else + if (*agp == NULLAGNUMBER) *agp = 0; - - if (pip) - xfs_iunlock(pip, XFS_IOLOCK_EXCL); - return err; } -/* - * Remove an association between an inode and a filestream object. - * Typically this is done on last close of an unlinked file. - */ void xfs_filestream_deassociate( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_mru_cache_t *cache = ip->i_mount->m_filestream; + xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino); +} + +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, + 10, xfs_fstrm_free_func); +} - xfs_mru_cache_delete(cache, ip->i_ino); +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); } diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 6d61dbee8564..2ef43406e53b 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -20,50 +20,20 @@ struct xfs_mount; struct xfs_inode; -struct xfs_perag; struct xfs_bmalloca; -#ifdef XFS_FILESTREAMS_TRACE -#define XFS_FSTRM_KTRACE_INFO 1 -#define XFS_FSTRM_KTRACE_AGSCAN 2 -#define XFS_FSTRM_KTRACE_AGPICK1 3 -#define XFS_FSTRM_KTRACE_AGPICK2 4 -#define XFS_FSTRM_KTRACE_UPDATE 5 -#define XFS_FSTRM_KTRACE_FREE 6 -#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7 -#define XFS_FSTRM_KTRACE_ASSOCIATE 8 -#define XFS_FSTRM_KTRACE_MOVEAG 9 -#define XFS_FSTRM_KTRACE_ORPHAN 10 - -#define XFS_FSTRM_KTRACE_SIZE 16384 -extern ktrace_t *xfs_filestreams_trace_buf; - -#endif - -/* allocation selection flags */ -typedef enum xfs_fstrm_alloc { - XFS_PICK_USERDATA = 1, - XFS_PICK_LOWSPACE = 2, -} xfs_fstrm_alloc_t; - -/* prototypes for filestream.c */ -int xfs_filestream_init(void); -void xfs_filestream_uninit(void); int xfs_filestream_mount(struct xfs_mount *mp); void xfs_filestream_unmount(struct xfs_mount *mp); -xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); -int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip); void xfs_filestream_deassociate(struct xfs_inode *ip); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); +int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); - -/* filestreams for the inode? */ static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || - xfs_iflags_test(ip, XFS_IFILESTREAM) || (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); } diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h index 9898f31d05d8..34d85aca3058 100644 --- a/fs/xfs/xfs_format.h +++ b/fs/xfs/xfs_format.h @@ -202,6 +202,8 @@ typedef __be32 xfs_alloc_ptr_t; */ #define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ #define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ +#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ +#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ typedef __uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) @@ -244,7 +246,17 @@ typedef __be32 xfs_inobt_ptr_t; * block numbers in the AG. */ #define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) -#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) +#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * The first data block of an AG depends on whether the filesystem was formatted + * with the finobt feature. If so, account for the finobt reserved root btree + * block. + */ +#define XFS_PREALLOC_BLOCKS(mp) \ + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + XFS_FIBT_BLOCK(mp) + 1 : \ + XFS_IBT_BLOCK(mp) + 1) diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c5fc116dfaa3..d34703dbcb42 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -238,6 +238,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 02fb943cbf22..d2295561570a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,6 +24,8 @@ #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" @@ -74,23 +76,18 @@ xfs_fs_geometry( } if (new_version >= 3) { geo->version = XFS_FSOP_GEOM_VERSION; - geo->flags = + geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | + XFS_FSOP_GEOM_FLAGS_DIRV2 | (xfs_sb_version_hasattr(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR : 0) | - (xfs_sb_version_hasnlink(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_NLINK : 0) | (xfs_sb_version_hasquota(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | (xfs_sb_version_hasalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | (xfs_sb_version_hasdalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | - (xfs_sb_version_hasshared(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SHARED : 0) | (xfs_sb_version_hasextflgbit(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | - (xfs_sb_version_hasdirv2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | (xfs_sb_version_hassector(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | (xfs_sb_version_hasasciici(&mp->m_sb) ? @@ -104,11 +101,13 @@ xfs_fs_geometry( (xfs_sb_version_hascrc(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_V5SB : 0) | (xfs_sb_version_hasftype(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FTYPE : 0); + XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | + (xfs_sb_version_hasfinobt(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FINOBT : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; - geo->dirblocksize = mp->m_dirblksize; + geo->dirblocksize = mp->m_dir_geo->blksize; } if (new_version >= 4) { geo->flags |= @@ -316,6 +315,10 @@ xfs_growfs_data_private( agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_sb_version_hascrc(&mp->m_sb)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); @@ -407,6 +410,34 @@ xfs_growfs_data_private( xfs_buf_relse(bp); if (error) goto error0; + + /* + * FINO btree root block + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, + 0, agno, 0); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } + } xfs_trans_agblocks_delta(tp, nfree); /* diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 8f711db61a0c..5960e5593fe0 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -112,6 +112,66 @@ xfs_inobt_get_rec( } /* + * Insert a single inobt record. Cursor must already point to desired location. + */ +STATIC int +xfs_inobt_insert_rec( + struct xfs_btree_cur *cur, + __int32_t freecount, + xfs_inofree_t free, + int *stat) +{ + cur->bc_rec.i.ir_freecount = freecount; + cur->bc_rec.i.ir_free = free; + return xfs_btree_insert(cur, stat); +} + +/* + * Insert records describing a newly allocated inode chunk into the inobt. + */ +STATIC int +xfs_inobt_insert( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t newino, + xfs_agino_t newlen, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agino_t thisino; + int i; + int error; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + + error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + XFS_INOBT_ALL_FREE, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + return 0; +} + +/* * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG @@ -220,10 +280,8 @@ xfs_ialloc_inode_init( if (tp) xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, mp->m_sb.sb_inodesize, length, gen); - } else if (xfs_sb_version_hasnlink(&mp->m_sb)) + } else version = 2; - else - version = 1; for (j = 0; j < nbufs; j++) { /* @@ -303,13 +361,10 @@ xfs_ialloc_ag_alloc( { xfs_agi_t *agi; /* allocation group header */ xfs_alloc_arg_t args; /* allocation argument structure */ - xfs_btree_cur_t *cur; /* inode btree cursor */ xfs_agnumber_t agno; int error; - int i; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ - xfs_agino_t thisino; /* current inode number, for loop */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ struct xfs_perag *pag; @@ -459,29 +514,19 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btree. + * Insert records describing the new inode chunk into the btrees. */ - cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); - for (thisino = newino; - thisino < newino + newlen; - thisino += XFS_INODES_PER_CHUNK) { - cur->bc_rec.i.ir_startino = thisino; - cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; - cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; - error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - ASSERT(i == 0); - error = xfs_btree_insert(cur, &i); - if (error) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_FINO); + if (error) return error; - } - ASSERT(i == 1); } - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); /* * Log allocation group header fields */ @@ -675,13 +720,10 @@ xfs_ialloc_get_rec( } /* - * Allocate an inode. - * - * The caller selected an AG for us, and made sure that free inodes are - * available. + * Allocate an inode using the inobt-only algorithm. */ STATIC int -xfs_dialloc_ag( +xfs_dialloc_ag_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, xfs_ino_t parent, @@ -707,7 +749,7 @@ xfs_dialloc_ag( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -940,6 +982,294 @@ error0: } /* + * Use the free inode btree to allocate an inode based on distance from the + * parent. Note that the provided cursor may be deleted and replaced. + */ +STATIC int +xfs_dialloc_ag_finobt_near( + xfs_agino_t pagino, + struct xfs_btree_cur **ocur, + struct xfs_inobt_rec_incore *rec) +{ + struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ + struct xfs_btree_cur *rcur; /* right search cursor */ + struct xfs_inobt_rec_incore rrec; + int error; + int i, j; + + error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); + if (error) + return error; + + if (i == 1) { + error = xfs_inobt_get_rec(lcur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + /* + * See if we've landed in the parent inode record. The finobt + * only tracks chunks with at least one free inode, so record + * existence is enough. + */ + if (pagino >= rec->ir_startino && + pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) + return 0; + } + + error = xfs_btree_dup_cursor(lcur, &rcur); + if (error) + return error; + + error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); + if (error) + goto error_rcur; + if (j == 1) { + error = xfs_inobt_get_rec(rcur, &rrec, &j); + if (error) + goto error_rcur; + XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + } + + XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + if (i == 1 && j == 1) { + /* + * Both the left and right records are valid. Choose the closer + * inode chunk to the target. + */ + if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > + (rrec.ir_startino - pagino)) { + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else { + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + } else if (j == 1) { + /* only the right record is valid */ + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else if (i == 1) { + /* only the left record is valid */ + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + + return 0; + +error_rcur: + xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Use the free inode btree to find a free inode based on a newino hint. If + * the hint is NULL, find the first free inode in the AG. + */ +STATIC int +xfs_dialloc_ag_finobt_newino( + struct xfs_agi *agi, + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *rec) +{ + int error; + int i; + + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, + &i); + if (error) + return error; + if (i == 1) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; + } + } + + /* + * Find the first inode available in the AG. + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; +} + +/* + * Update the inobt based on a modification made to the finobt. Also ensure that + * the records from both trees are equivalent post-modification. + */ +STATIC int +xfs_dialloc_ag_update_inobt( + struct xfs_btree_cur *cur, /* inobt cursor */ + struct xfs_inobt_rec_incore *frec, /* finobt record */ + int offset) /* inode offset */ +{ + struct xfs_inobt_rec_incore rec; + int error; + int i; + + error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + + XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + (rec.ir_freecount == frec->ir_freecount)); + + error = xfs_inobt_update(cur, &rec); + if (error) + return error; + + return 0; +} + +/* + * Allocate an inode using the free inode btree, if available. Otherwise, fall + * back to the inobt search algorithm. + * + * The caller selected an AG for us, and made sure that free inodes are + * available. + */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; /* finobt cursor */ + struct xfs_btree_cur *icur; /* inobt cursor */ + struct xfs_inobt_rec_incore rec; + xfs_ino_t ino; + int error; + int offset; + int i; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + + pag = xfs_perag_get(mp, agno); + + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_cur; + + /* + * The search algorithm depends on whether we're in the same AG as the + * parent. If so, find the closest available inode to the parent. If + * not, consider the agi hint or find the first free inode in the AG. + */ + if (agno == pagno) + error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); + else + error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); + if (error) + goto error_cur; + + offset = xfs_lowbit64(rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + + /* + * Modify or remove the finobt record. + */ + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + if (rec.ir_freecount) + error = xfs_inobt_update(cur, &rec); + else + error = xfs_btree_delete(cur, &i); + if (error) + goto error_cur; + + /* + * The finobt has now been updated appropriately. We haven't updated the + * agi and superblock yet, so we can create an inobt cursor and validate + * the original freecount. If all is well, make the equivalent update to + * the inobt using the finobt record and offset information. + */ + icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + + error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); + if (error) + goto error_icur; + + /* + * Both trees have now been updated. We must update the perag and + * superblock before we can check the freecount for each btree. + */ + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_icur; + + xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_perag_put(pag); + *inop = ino; + return 0; + +error_icur: + xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); +error_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* * Allocate an inode on disk. * * Mode is used to tell whether the new inode will need space, and whether it @@ -1098,78 +1428,34 @@ out_error: return XFS_ERROR(error); } -/* - * Free disk inode. Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. - */ -int -xfs_difree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - xfs_bmap_free_t *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino) /* first inode in deleted cluster */ +STATIC int +xfs_difree_inobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_bmap_free *flist, + int *deleted, + xfs_ino_t *first_ino, + struct xfs_inobt_rec_incore *orec) { - /* REFERENCED */ - xfs_agblock_t agbno; /* block number containing inode */ - xfs_buf_t *agbp; /* buffer containing allocation group header */ - xfs_agino_t agino; /* inode number relative to allocation group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header */ - xfs_btree_cur_t *cur; /* inode btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ilen; /* inodes in an inode cluster */ - xfs_mount_t *mp; /* mount structure for filesystem */ - int off; /* offset of inode in inode chunk */ - xfs_inobt_rec_incore_t rec; /* btree record */ - struct xfs_perag *pag; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int ilen; + int error; + int i; + int off; - mp = tp->t_mountp; - - /* - * Break up inode number into its components. - */ - agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", - __func__, agno, mp->m_sb.sb_agcount); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { - xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", - __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", - __func__, agbno, mp->m_sb.sb_agblocks); - ASSERT(0); - return XFS_ERROR(EINVAL); - } - /* - * Get the allocation group header. - */ - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - if (error) { - xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", - __func__, error); - return error; - } - agi = XFS_BUF_TO_AGI(agbp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); - ASSERT(agbno < be32_to_cpu(agi->agi_length)); + ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); + /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); error = xfs_check_agi_freecount(cur, agi); if (error) @@ -1209,7 +1495,7 @@ xfs_difree( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && (rec.ir_freecount == mp->m_ialloc_inos)) { - *delete = 1; + *deleted = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); /* @@ -1237,7 +1523,7 @@ xfs_difree( XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), mp->m_ialloc_blks, flist, mp); } else { - *delete = 0; + *deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1261,6 +1547,7 @@ xfs_difree( if (error) goto error0; + *orec = rec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; @@ -1269,6 +1556,182 @@ error0: return error; } +/* + * Free an inode in the free inode btree. + */ +STATIC int +xfs_difree_finobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int offset = agino - ibtrec->ir_startino; + int error; + int i; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + if (i == 0) { + /* + * If the record does not exist in the finobt, we must have just + * freed an inode in a previously fully allocated chunk. If not, + * something is out of sync. + */ + XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + + error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + ibtrec->ir_free, &i); + if (error) + goto error; + ASSERT(i == 1); + + goto out; + } + + /* + * Read and update the existing record. We could just copy the ibtrec + * across here, but that would defeat the purpose of having redundant + * metadata. By making the modifications independently, we can catch + * corruptions that we wouldn't see if we just copied from one record + * to another. + */ + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(i == 1, error); + + rec.ir_free |= XFS_INOBT_MASK(offset); + rec.ir_freecount++; + + XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + (rec.ir_freecount == ibtrec->ir_freecount), + error); + + /* + * The content of inobt records should always match between the inobt + * and finobt. The lifecycle of records in the finobt is different from + * the inobt in that the finobt only tracks records with at least one + * free inode. Hence, if all of the inodes are free and we aren't + * keeping inode chunks permanently on disk, remove the record. + * Otherwise, update the record with the new information. + */ + if (rec.ir_freecount == mp->m_ialloc_inos && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + error = xfs_btree_delete(cur, &i); + if (error) + goto error; + ASSERT(i == 1); + } else { + error = xfs_inobt_update(cur, &rec); + if (error) + goto error; + } + +out: + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *deleted,/* set if inode cluster was deleted */ + xfs_ino_t *first_ino)/* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + struct xfs_buf *agbp; /* buffer for allocation group header */ + xfs_agino_t agino; /* allocation group inode number */ + xfs_agnumber_t agno; /* allocation group number */ + int error; /* error return value */ + struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_inobt_rec_incore rec;/* btree record */ + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", + __func__, agno, mp->m_sb.sb_agcount); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + __func__, (unsigned long long)inode, + (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", + __func__, agbno, mp->m_sb.sb_agblocks); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + /* + * Get the allocation group header. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", + __func__, error); + return error; + } + + /* + * Fix up the inode allocation btree. + */ + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, + &rec); + if (error) + goto error0; + + /* + * Fix up the free inode btree. + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + if (error) + goto error0; + } + + return 0; + +error0: + return error; +} + STATIC int xfs_imap_lookup( struct xfs_mount *mp, @@ -1300,7 +1763,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -1488,7 +1951,16 @@ xfs_ialloc_compute_maxlevels( } /* - * Log specified fields for the ag hdr (inode section) + * Log specified fields for the ag hdr (inode section). The growth of the agi + * structure over time requires that we interpret the buffer as two logical + * regions delineated by the end of the unlinked list. This is due to the size + * of the hash table and its location in the middle of the agi. + * + * For example, a request to log a field before agi_unlinked and a field after + * agi_unlinked could cause us to log the entire hash table and use an excessive + * amount of log space. To avoid this behavior, log the region up through + * agi_unlinked in one call and the region after agi_unlinked through the end of + * the structure in another. */ void xfs_ialloc_log_agi( @@ -1511,6 +1983,8 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_newino), offsetof(xfs_agi_t, agi_dirino), offsetof(xfs_agi_t, agi_unlinked), + offsetof(xfs_agi_t, agi_free_root), + offsetof(xfs_agi_t, agi_free_level), sizeof(xfs_agi_t) }; #ifdef DEBUG @@ -1519,15 +1993,30 @@ xfs_ialloc_log_agi( agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); + /* - * Compute byte offsets for the first and last fields. + * Compute byte offsets for the first and last fields in the first + * region and log the agi buffer. This only logs up through + * agi_unlinked. */ - xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); + if (fields & XFS_AGI_ALL_BITS_R1) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } + /* - * Log the allocation group inode header buffer. + * Mask off the bits in the first region and calculate the first and + * last field offsets for any bits in the second region. */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - xfs_trans_log_buf(tp, bp, first, last); + fields &= ~XFS_AGI_ALL_BITS_R1; + if (fields) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } } #ifdef DEBUG @@ -1640,7 +2129,6 @@ xfs_read_agi( if (error) return error; - ASSERT(!xfs_buf_geterror(*bpp)); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 812365d17e67..95ad1c002d60 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -90,7 +90,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ + int *deleted, /* set if inode cluster was deleted */ xfs_ino_t *first_ino); /* first inode in deleted cluster */ /* diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 7e309b11e87d..726f83a681a5 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -49,7 +49,8 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); } STATIC void @@ -66,12 +67,26 @@ xfs_inobt_set_root( xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); } +STATIC void +xfs_finobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_free_root = nptr->s; + be32_add_cpu(&agi->agi_free_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, + XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); +} + STATIC int xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int length, int *stat) { xfs_alloc_arg_t args; /* block allocation args */ @@ -173,6 +188,17 @@ xfs_inobt_init_ptr_from_cur( ptr->s = agi->agi_root; } +STATIC void +xfs_finobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; +} + STATIC __int64_t xfs_inobt_key_diff( struct xfs_btree_cur *cur, @@ -203,6 +229,7 @@ xfs_inobt_verify( */ switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): + case cpu_to_be32(XFS_FIBT_CRC_MAGIC): if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) @@ -214,6 +241,7 @@ xfs_inobt_verify( return false; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): + case cpu_to_be32(XFS_FIBT_MAGIC): break; default: return 0; @@ -317,6 +345,28 @@ static const struct xfs_btree_ops xfs_inobt_ops = { #endif }; +static const struct xfs_btree_ops xfs_finobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_finobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif +}; + /* * Allocate a new inode btree cursor. */ @@ -325,7 +375,8 @@ xfs_inobt_init_cursor( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for agi structure */ - xfs_agnumber_t agno) /* allocation group number */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* ialloc or free ino btree */ { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; @@ -334,11 +385,17 @@ xfs_inobt_init_cursor( cur->bc_tp = tp; cur->bc_mp = mp; - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_btnum = XFS_BTNUM_INO; + cur->bc_btnum = btnum; + if (btnum == XFS_BTNUM_INO) { + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_ops = &xfs_inobt_ops; + } else { + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ops = &xfs_finobt_ops; + } + cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_inobt_ops; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f38b22011c4e..d7ebea72c2d0 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -58,7 +58,8 @@ struct xfs_mount; ((index) - 1) * sizeof(xfs_inobt_ptr_t))) extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, + xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 98d35244eecc..c48df5f25b9f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -507,8 +507,7 @@ STATIC int xfs_inode_ag_walk( struct xfs_mount *mp, struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, @@ -582,7 +581,7 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - error = execute(batch[i], pag, flags, args); + error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == EAGAIN) { skipped++; @@ -636,8 +635,7 @@ xfs_eofblocks_worker( int xfs_inode_ag_iterator( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args) @@ -664,8 +662,7 @@ xfs_inode_ag_iterator( int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, - struct xfs_perag *pag, int flags, + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, @@ -1209,7 +1206,6 @@ xfs_inode_match_id( STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - struct xfs_perag *pag, int flags, void *args) { diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 9ed68bb750f5..9cf017b899be 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -60,12 +60,10 @@ int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); void xfs_eofblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, - int flags, void *args), + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, - int flags, void *args), + int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); static inline int diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5e7a38fa6ee6..a6115fe1ac94 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -655,7 +655,6 @@ xfs_ialloc( uint flags; int error; timespec_t tv; - int filestreams = 0; /* * Call the space management code to pick @@ -682,6 +681,14 @@ xfs_ialloc( return error; ASSERT(ip != NULL); + /* + * We always convert v1 inodes to v2 now - we only support filesystems + * with >= v2 inode capability, so there is no reason for ever leaving + * an inode in v1 format. + */ + if (ip->i_d.di_version == 1) + ip->i_d.di_version = 2; + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; @@ -691,27 +698,6 @@ xfs_ialloc( xfs_set_projid(ip, prid); memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - /* - * If the superblock version is up to where we support new format - * inodes and this is currently an old format inode, then change - * the inode version number now. This way we only do the conversion - * here rather than here and in the flush/logging code. - */ - if (xfs_sb_version_hasnlink(&mp->m_sb) && - ip->i_d.di_version == 1) { - ip->i_d.di_version = 2; - /* - * We've already zeroed the old link count, the projid field, - * and the pad field. - */ - } - - /* - * Project ids won't be stored on disk if we are using a version 1 inode. - */ - if ((prid != 0) && (ip->i_d.di_version == 1)) - xfs_bump_ino_vers2(tp, ip); - if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { @@ -772,13 +758,6 @@ xfs_ialloc( flags |= XFS_ILOG_DEV; break; case S_IFREG: - /* - * we can't set up filestreams until after the VFS inode - * is set up properly. - */ - if (pip && xfs_inode_is_filestream(pip)) - filestreams = 1; - /* fall through */ case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; @@ -844,15 +823,6 @@ xfs_ialloc( /* now that we have an i_mode we can setup inode ops and unlock */ xfs_setup_inode(ip); - /* now we have set up the vfs inode we can associate the filestream */ - if (filestreams) { - error = xfs_filestream_associate(pip, ip); - if (error < 0) - return -error; - if (!error) - xfs_iflags_set(ip, XFS_IFILESTREAM); - } - *ipp = ip; return 0; } @@ -1073,40 +1043,6 @@ xfs_droplink( } /* - * This gets called when the inode's version needs to be changed from 1 to 2. - * Currently this happens when the nlink field overflows the old 16-bit value - * or when chproj is called to change the project for the first time. - * As a side effect the superblock version will also get rev'd - * to contain the NLINK bit. - */ -void -xfs_bump_ino_vers2( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_d.di_version == 1); - - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - mp = tp->t_mountp; - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - xfs_sb_version_addnlink(&mp->m_sb); - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, XFS_SB_VERSIONNUM); - } else { - spin_unlock(&mp->m_sb_lock); - } - } - /* Caller must log the inode */ -} - -/* * Increment the link count on an inode & log the change. */ int @@ -1116,22 +1052,10 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + ASSERT(ip->i_d.di_version > 1); ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); ip->i_d.di_nlink++; inc_nlink(VFS_I(ip)); - if ((ip->i_d.di_version == 1) && - (ip->i_d.di_nlink > XFS_MAXLINK_1)) { - /* - * The inode has increased its number of links beyond - * what can fit in an old format inode. It now needs - * to be converted to a version 2 inode with a 32 bit - * link count. If this is the first inode in the file - * system to do this, then we need to bump the superblock - * version number as well. - */ - xfs_bump_ino_vers2(tp, ip); - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); return 0; } @@ -1334,7 +1258,8 @@ int xfs_create_tmpfile( struct xfs_inode *dp, struct dentry *dentry, - umode_t mode) + umode_t mode, + struct xfs_inode **ipp) { struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip = NULL; @@ -1402,7 +1327,6 @@ xfs_create_tmpfile( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); ip->i_d.di_nlink--; - d_tmpfile(dentry, VFS_I(ip)); error = xfs_iunlink(tp, ip); if (error) goto out_trans_abort; @@ -1415,6 +1339,7 @@ xfs_create_tmpfile( xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); + *ipp = ip; return 0; out_trans_abort: @@ -1698,16 +1623,6 @@ xfs_release( int truncated; /* - * If we are using filestreams, and we have an unlinked - * file that we are processing the last close on, then nothing - * will be able to reopen and write to this file. Purge this - * inode from the filestreams cache so that it doesn't delay - * teardown of the inode. - */ - if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip)) - xfs_filestream_deassociate(ip); - - /* * If we previously truncated this file and removed old data * in the process, we want to initiate "early" writeout on * the last close. This is an attempt to combat the notorious @@ -1837,9 +1752,33 @@ xfs_inactive_ifree( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0); + + /* + * The ifree transaction might need to allocate blocks for record + * insertion to the finobt. We don't want to fail here at ENOSPC, so + * allow ifree to dip into the reserved block pool if necessary. + * + * Freeing large sets of inodes generally means freeing inode chunks, + * directory and file data blocks, so this should be relatively safe. + * Only under severe circumstances should it be possible to free enough + * inodes to exhaust the reserve block pool via finobt expansion while + * at the same time not creating free space in the filesystem. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + if (error == ENOSPC) { + xfs_warn_ratelimited(mp, + "Failed to remove inode(s) from unlinked list. " + "Please free space, unmount and run xfs_repair."); + } else { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + } xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); return error; } @@ -2663,13 +2602,7 @@ xfs_remove( if (error) goto std_return; - /* - * If we are using filestreams, kill the stream association. - * If the file is still open it may get a new one but that - * will get killed on last close in xfs_close() so we don't - * have to worry about that. - */ - if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) + if (is_dir && xfs_inode_is_filestream(ip)) xfs_filestream_deassociate(ip); return 0; @@ -3257,6 +3190,7 @@ xfs_iflush_int( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -3317,7 +3251,7 @@ xfs_iflush_int( } /* - * Inode item log recovery for v1/v2 inodes are dependent on the + * Inode item log recovery for v2 inodes are dependent on the * di_flushiter count for correct sequencing. We bump the flush * iteration count so we can detect flushes which postdate a log record * during recovery. This is redundant as we now log every change and @@ -3340,40 +3274,9 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - /* - * If this is really an old format inode and the superblock version - * has not been updated to support only new format inodes, then - * convert back to the old inode format. If the superblock version - * has been updated, then make the conversion permanent. - */ - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - dip->di_version = 2; - ip->i_d.di_onlink = 0; - dip->di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - memset(&(dip->di_pad[0]), 0, - sizeof(dip->di_pad)); - ASSERT(xfs_get_projid(ip) == 0); - } - } - - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); xfs_inobp_check(mp, bp); /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 396cc1fafd0d..f72bffa67266 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -209,7 +209,6 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ #define XFS_INEW (1 << 3) /* inode has just been allocated */ -#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -225,8 +224,7 @@ xfs_get_initial_prid(struct xfs_inode *dp) */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ - XFS_IFILESTREAM); + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) /* * Synchronize processes attempting to flush the in-core inode back to disk. @@ -334,7 +332,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, - umode_t mode); + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, @@ -379,7 +377,6 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, struct xfs_inode **, int *); int xfs_droplink(struct xfs_trans *, struct xfs_inode *); int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); -void xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c index 24e993996bdc..cb35ae41d4a1 100644 --- a/fs/xfs/xfs_inode_buf.c +++ b/fs/xfs/xfs_inode_buf.c @@ -437,17 +437,16 @@ xfs_iread( } /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. + * Automatically convert version 1 inode formats in memory to version 2 + * inode format. If the inode is modified, it will get logged and + * rewritten as a version 2 inode. We can do this because we set the + * superblock feature bit for v2 inodes unconditionally during mount + * and it means the reast of the code can assume the inode version is 2 + * or higher. */ if (ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); ip->i_d.di_nlink = ip->i_d.di_onlink; ip->i_d.di_onlink = 0; xfs_set_projid(ip, 0); diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c index 73514c0486b7..b031e8d0d928 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/xfs_inode_fork.c @@ -798,8 +798,7 @@ xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, xfs_inode_log_item_t *iip, - int whichfork, - xfs_buf_t *bp) + int whichfork) { char *cp; xfs_ifork_t *ifp; diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h index eb329a1ea888..7d3b1ed6dcbe 100644 --- a/fs/xfs/xfs_inode_fork.h +++ b/fs/xfs/xfs_inode_fork.h @@ -127,8 +127,7 @@ typedef struct xfs_ifork { int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, - struct xfs_inode_log_item *, int, - struct xfs_buf *); + struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); void xfs_iroot_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 686889b4a1e5..a640137b3573 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -145,34 +145,6 @@ xfs_inode_item_size( xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } -/* - * If this is a v1 format inode, then we need to log it as such. This means - * that we have to copy the link count from the new field to the old. We - * don't have to worry about the new fields, because nothing trusts them as - * long as the old inode version number is there. - */ -STATIC void -xfs_inode_item_format_v1_inode( - struct xfs_inode *ip) -{ - if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - ip->i_d.di_onlink = ip->i_d.di_nlink; - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - } -} - STATIC void xfs_inode_item_format_data_fork( struct xfs_inode_log_item *iip, @@ -370,6 +342,8 @@ xfs_inode_item_format( struct xfs_inode_log_format *ilf; struct xfs_log_iovec *vecp = NULL; + ASSERT(ip->i_d.di_version > 1); + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); ilf->ilf_type = XFS_LI_INODE; ilf->ilf_ino = ip->i_ino; @@ -380,8 +354,6 @@ xfs_inode_item_format( ilf->ilf_size = 2; /* format + core */ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); - if (ip->i_d.di_version == 1) - xfs_inode_item_format_v1_inode(ip); xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, &ip->i_d, xfs_icdinode_size(ip->i_d.di_version)); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0b18776b075e..8bc1bbce7451 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -543,10 +543,11 @@ xfs_attrmulti_by_handle( ops = memdup_user(am_hreq.ops, size); if (IS_ERR(ops)) { - error = PTR_ERR(ops); + error = -PTR_ERR(ops); goto out_dput; } + error = ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -556,7 +557,7 @@ xfs_attrmulti_by_handle( ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; + error = ERANGE; if (ops[i].am_error < 0) break; @@ -1215,7 +1216,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !inode_capable(VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* @@ -1227,15 +1228,8 @@ xfs_ioctl_setattr( olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } + ASSERT(ip->i_d.di_version > 1); xfs_set_projid(ip, fa->fsx_projid); - - /* - * We may have to rev the inode as well as - * the superblock version number since projids didn't - * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. - */ - if (ip->i_d.di_version == 1) - xfs_bump_ino_vers2(tp, ip); } } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index a7992f8de9d3..944d5baa710a 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -424,10 +424,11 @@ xfs_compat_attrmulti_by_handle( ops = memdup_user(compat_ptr(am_hreq.ops), size); if (IS_ERR(ops)) { - error = PTR_ERR(ops); + error = -PTR_ERR(ops); goto out_dput; } + error = ENOMEM; attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; @@ -438,7 +439,7 @@ xfs_compat_attrmulti_by_handle( compat_ptr(ops[i].am_attrname), MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; + error = ERANGE; if (ops[i].am_error < 0) break; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 3b80ebae05f5..6d3ec2b6ee29 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -730,7 +730,7 @@ xfs_iomap_write_allocate( */ nimaps = 1; end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(NULL, ip, &last_block, + error = xfs_bmap_last_offset(ip, &last_block, XFS_DATA_FORK); if (error) goto trans_cancel; @@ -749,8 +749,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, - XFS_BMAPI_STACK_SWITCH, + count_fsb, 0, &first_block, 1, imap, &nimaps, &free_list); if (error) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 89b07e43ca28..205613a06068 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -72,8 +72,8 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); if (error < 0) break; } @@ -93,8 +93,8 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -124,15 +124,15 @@ xfs_cleanup_inode( xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -156,8 +156,12 @@ xfs_vn_mknod( if (error) return error; - xfs_dentry_to_name(&name, dentry, mode); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } if (unlikely(error)) goto out_free_acl; @@ -169,18 +173,22 @@ xfs_vn_mknod( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } #endif - d_instantiate(dentry, inode); + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -189,11 +197,23 @@ xfs_vn_mknod( return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); goto out_free_acl; } STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + +STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, @@ -353,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -808,22 +829,34 @@ xfs_setattr_size( */ inode_dio_wait(inode); + /* + * Do all the page cache truncate work outside the transaction context + * as the "lock" order is page lock->log space reservation. i.e. + * locking pages inside the transaction can ABBA deadlock with + * writeback. We have to do the VFS inode size update before we truncate + * the pagecache, however, to avoid racing with page faults beyond the + * new EOF they are not serialised against truncate operations except by + * page locks and size updates. + * + * Hence we are in a situation where a truncate can fail with ENOMEM + * from xfs_trans_reserve(), but having already truncated the in-memory + * version of the file (i.e. made user visible changes). There's not + * much we can do about this, except to hope that the caller sees ENOMEM + * and retries the truncate operation. + */ error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; + truncate_setsize(inode, newsize); tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; - truncate_setsize(inode, newsize); - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); /* @@ -1053,11 +1086,7 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; - - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode); - - return -error; + return xfs_generic_create(dir, dentry, mode, 0, true); } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f46338285152..cb64f222d607 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -270,7 +270,8 @@ xfs_bulkstat( /* * Allocate and initialize a btree cursor for ialloc btree. */ - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; @@ -621,7 +622,8 @@ xfs_inumbers( agino = 0; continue; } - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &tmp); if (error) { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399d..292308dede6d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -616,11 +616,13 @@ xfs_log_mount( int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -1163,7 +1165,7 @@ xlog_iodone(xfs_buf_t *bp) /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp, + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); @@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1373,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1411,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1438,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1646,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1659,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1667,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1870,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); @@ -3915,11 +3952,14 @@ xfs_log_force_umount( retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); } + /* - * Wake up everybody waiting on xfs_log_force. - * Callback all log item committed functions as if the - * log writes were completed. + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. */ + wake_up_all(&log->l_cilp->xc_commit_wait); xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); #ifdef XFSERRORDEBUG diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2c4004475e71..84e0deb95abd 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -24,7 +24,8 @@ struct xfs_log_vec { struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ char *lv_buf; /* formatted buffer */ - int lv_buf_len; /* size of formatted buffer */ + int lv_bytes; /* accounted space in buffer */ + int lv_buf_len; /* aligned size of buffer */ int lv_size; /* size of allocated lv */ }; @@ -52,15 +53,21 @@ xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return vec->i_addr; } +/* + * We need to make sure the next buffer is naturally aligned for the biggest + * basic data type we put into it. We already accounted for this padding when + * sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + */ static inline void xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) { - /* - * We need to make sure the next buffer is naturally aligned for the - * biggest basic data type we put into it. We already accounted for - * this when sizing the buffer. - */ lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_bytes += len; vec->i_len = len; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7e5455391176..b3425b34e3d5 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -97,7 +97,7 @@ xfs_cil_prepare_item( { /* Account for the new LV being passed in */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { - *diff_len += lv->lv_buf_len; + *diff_len += lv->lv_bytes; *diff_iovecs += lv->lv_niovecs; } @@ -111,7 +111,7 @@ xfs_cil_prepare_item( else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); - *diff_len -= old_lv->lv_buf_len; + *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; kmem_free(old_lv); } @@ -239,7 +239,7 @@ xlog_cil_insert_format_items( * that the space reservation accounting is correct. */ *diff_iovecs -= lv->lv_niovecs; - *diff_len -= lv->lv_buf_len; + *diff_len -= lv->lv_bytes; } else { /* allocate new data chunk */ lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); @@ -259,6 +259,7 @@ xlog_cil_insert_format_items( /* The allocated data region lies beyond the iovec region */ lv->lv_buf_len = 0; + lv->lv_bytes = 0; lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); @@ -385,7 +386,15 @@ xlog_cil_committed( xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + /* + * If we are aborting the commit, wake up anyone waiting on the + * committing list. If we don't, then a shutdown we can leave processes + * waiting in xlog_cil_force_lsn() waiting on a sequence commit that + * will never happen because we aborted it. + */ spin_lock(&ctx->cil->xc_push_lock); + if (abort) + wake_up_all(&ctx->cil->xc_commit_wait); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); @@ -564,8 +573,18 @@ restart: spin_lock(&cil->xc_push_lock); list_for_each_entry(new_ctx, &cil->xc_committing, committing) { /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&cil->xc_push_lock); + goto out_abort_free_ticket; + } + + /* * Higher sequences will wait for this one so skip them. - * Don't wait for own own sequence, either. + * Don't wait for our own sequence, either. */ if (new_ctx->sequence >= ctx->sequence) continue; @@ -810,6 +829,13 @@ restart: */ spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto out_shutdown; if (ctx->sequence > sequence) continue; if (!ctx->commit_lsn) { @@ -833,14 +859,12 @@ restart: * push sequence after the above wait loop and the CIL still contains * dirty objects. * - * When the push occurs, it will empty the CIL and - * atomically increment the currect sequence past the push sequence and - * move it into the committing list. Of course, if the CIL is clean at - * the time of the push, it won't have pushed the CIL at all, so in that - * case we should try the push for this sequence again from the start - * just in case. + * When the push occurs, it will empty the CIL and atomically increment + * the currect sequence past the push sequence and move it into the + * committing list. Of course, if the CIL is clean at the time of the + * push, it won't have pushed the CIL at all, so in that case we should + * try the push for this sequence again from the start just in case. */ - if (sequence == cil->xc_current_sequence && !list_empty(&cil->xc_cil)) { spin_unlock(&cil->xc_push_lock); @@ -849,6 +873,17 @@ restart: spin_unlock(&cil->xc_push_lock); return commit_lsn; + + /* + * We detected a shutdown in progress. We need to trigger the log force + * to pass through it's iclog state machine error handling, even though + * we are already in a shutdown state. Hence we can't return + * NULLCOMMITLSN here as that has special meaning to log forces (i.e. + * LSN is already stable), so we return a zero LSN instead. + */ +out_shutdown: + spin_unlock(&cil->xc_push_lock); + return 0; } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index bce53ac81096..981af0f6504b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2138,7 +2138,9 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_allocbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; case XFS_BMAP_CRC_MAGIC: @@ -3145,7 +3147,7 @@ xlog_recover_efd_pass2( } lip = xfs_trans_ail_cursor_next(ailp, &cur); } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return 0; @@ -3520,8 +3522,7 @@ out: STATIC int xlog_recover_unmount_trans( - struct xlog *log, - struct xlog_recover *trans) + struct xlog *log) { /* Do nothing now */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); @@ -3595,7 +3596,7 @@ xlog_recover_process_data( trans, pass); break; case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(log, trans); + error = xlog_recover_unmount_trans(log); break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(log, @@ -3757,7 +3758,7 @@ xlog_recover_process_efis( lip = xfs_trans_ail_cursor_next(ailp, &cur); } out: - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return error; } diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c index 2af1a0a4d0f1..ee7e0e80246b 100644 --- a/fs/xfs/xfs_log_rlimit.c +++ b/fs/xfs/xfs_log_rlimit.c @@ -42,7 +42,7 @@ xfs_log_calc_max_attrsetm_res( int size; int nblks; - size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) - + size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - MAXNAMELEN - 1; nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); nblks += XFS_B_TO_FSB(mp, size); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 993cb19e7d39..3507cd0ec400 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -323,8 +323,19 @@ reread: /* * Initialize the mount structure from the superblock. */ - xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); - xfs_sb_quota_from_disk(&mp->m_sb); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_quota_from_disk(sbp); + + /* + * If we haven't validated the superblock, do so now before we try + * to check the sector size and reread the superblock appropriately. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (loud) + xfs_warn(mp, "Invalid superblock magic number"); + error = EINVAL; + goto release_buf; + } /* * We must be able to do sector-sized and sector-aligned IO. @@ -337,11 +348,11 @@ reread: goto release_buf; } - /* - * Re-read the superblock so the buffer is correctly sized, - * and properly verified. - */ if (buf_ops == NULL) { + /* + * Re-read the superblock so the buffer is correctly sized, + * and properly verified. + */ xfs_buf_relse(bp); sector_size = sbp->sb_sectsize; buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; @@ -697,6 +708,12 @@ xfs_mountfs( mp->m_update_flags |= XFS_SB_VERSIONNUM; } + /* always use v2 inodes by default now */ + if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { + mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_update_flags |= XFS_SB_VERSIONNUM; + } + /* * Check if sb_agblocks is aligned at stripe boundary * If sb_agblocks is NOT aligned turn off m_dalign since @@ -743,8 +760,6 @@ xfs_mountfs( new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; - xfs_info(mp, "Using inode cluster size of %d bytes", - mp->m_inode_cluster_size); } /* @@ -776,12 +791,11 @@ xfs_mountfs( mp->m_dmevmask = 0; /* not persistent; set after each mount */ - xfs_dir_mount(mp); - - /* - * Initialize the attribute manager's entries. - */ - mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100; + error = xfs_da_mount(mp); + if (error) { + xfs_warn(mp, "Failed dir/attr init: %d", error); + goto out_remove_uuid; + } /* * Initialize the precomputed transaction reservations values. @@ -796,7 +810,7 @@ xfs_mountfs( error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); - goto out_remove_uuid; + goto out_free_dir; } if (!sbp->sb_logblocks) { @@ -971,6 +985,8 @@ xfs_mountfs( xfs_wait_buftarg(mp->m_ddev_targp); out_free_perag: xfs_free_perag(mp); + out_free_dir: + xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); out: @@ -1048,6 +1064,7 @@ xfs_unmountfs( "Freespace may not be correct on next mount."); xfs_log_unmount(mp); + xfs_da_unmount(mp); xfs_uuid_unmount(mp); #if defined(DEBUG) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a466c5e5826e..7295a0b7c343 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -27,6 +27,7 @@ struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; struct xfs_dir_ops; +struct xfs_da_geometry; #ifdef HAVE_PERCPU_SB @@ -96,6 +97,8 @@ typedef struct xfs_mount { uint m_readio_blocks; /* min read size blocks */ uint m_writeio_log; /* min write size log bytes */ uint m_writeio_blocks; /* min write size blocks */ + struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ + struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ @@ -131,8 +134,6 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ - uint m_dir_node_ents; /* #entries in a dir danode */ - uint m_attr_node_ents; /* #entries in attr danode */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ @@ -145,17 +146,10 @@ typedef struct xfs_mount { int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ - int m_attr_magicpct;/* 37% of the blocksize */ - int m_dir_magicpct; /* 37% of the dir blocksize */ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ - int m_dirblksize; /* directory block sz--bytes */ - int m_dirblkfsbs; /* directory block sz--fsbs */ - xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ - xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ - xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 4aff56395732..f99b4933dc22 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -100,14 +100,20 @@ * likely result in a loop in one of the lists. That's a sure-fire recipe for * an infinite loop in the code. */ -typedef struct xfs_mru_cache_elem -{ - struct list_head list_node; - unsigned long key; - void *value; -} xfs_mru_cache_elem_t; +struct xfs_mru_cache { + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +}; -static kmem_zone_t *xfs_mru_elem_zone; static struct workqueue_struct *xfs_mru_reap_wq; /* @@ -129,12 +135,12 @@ static struct workqueue_struct *xfs_mru_reap_wq; */ STATIC unsigned long _xfs_mru_cache_migrate( - xfs_mru_cache_t *mru, - unsigned long now) + struct xfs_mru_cache *mru, + unsigned long now) { - unsigned int grp; - unsigned int migrated = 0; - struct list_head *lru_list; + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; /* Nothing to do if the data store is empty. */ if (!mru->time_zero) @@ -193,11 +199,11 @@ _xfs_mru_cache_migrate( */ STATIC void _xfs_mru_cache_list_insert( - xfs_mru_cache_t *mru, - xfs_mru_cache_elem_t *elem) + struct xfs_mru_cache *mru, + struct xfs_mru_cache_elem *elem) { - unsigned int grp = 0; - unsigned long now = jiffies; + unsigned int grp = 0; + unsigned long now = jiffies; /* * If the data store is empty, initialise time zero, leave grp set to @@ -231,10 +237,10 @@ _xfs_mru_cache_list_insert( */ STATIC void _xfs_mru_cache_clear_reap_list( - xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock) - + struct xfs_mru_cache *mru) + __releases(mru->lock) __acquires(mru->lock) { - xfs_mru_cache_elem_t *elem, *next; + struct xfs_mru_cache_elem *elem, *next; struct list_head tmp; INIT_LIST_HEAD(&tmp); @@ -252,15 +258,8 @@ _xfs_mru_cache_clear_reap_list( spin_unlock(&mru->lock); list_for_each_entry_safe(elem, next, &tmp, list_node) { - - /* Remove the element from the reap list. */ list_del_init(&elem->list_node); - - /* Call the client's free function with the key and value pointer. */ - mru->free_func(elem->key, elem->value); - - /* Free the element structure. */ - kmem_zone_free(xfs_mru_elem_zone, elem); + mru->free_func(elem); } spin_lock(&mru->lock); @@ -277,7 +276,8 @@ STATIC void _xfs_mru_cache_reap( struct work_struct *work) { - xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work); + struct xfs_mru_cache *mru = + container_of(work, struct xfs_mru_cache, work.work); unsigned long now, next; ASSERT(mru && mru->lists); @@ -304,28 +304,16 @@ _xfs_mru_cache_reap( int xfs_mru_cache_init(void) { - xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), - "xfs_mru_cache_elem"); - if (!xfs_mru_elem_zone) - goto out; - xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); if (!xfs_mru_reap_wq) - goto out_destroy_mru_elem_zone; - + return -ENOMEM; return 0; - - out_destroy_mru_elem_zone: - kmem_zone_destroy(xfs_mru_elem_zone); - out: - return -ENOMEM; } void xfs_mru_cache_uninit(void) { destroy_workqueue(xfs_mru_reap_wq); - kmem_zone_destroy(xfs_mru_elem_zone); } /* @@ -336,14 +324,14 @@ xfs_mru_cache_uninit(void) */ int xfs_mru_cache_create( - xfs_mru_cache_t **mrup, + struct xfs_mru_cache **mrup, unsigned int lifetime_ms, unsigned int grp_count, xfs_mru_cache_free_func_t free_func) { - xfs_mru_cache_t *mru = NULL; - int err = 0, grp; - unsigned int grp_time; + struct xfs_mru_cache *mru = NULL; + int err = 0, grp; + unsigned int grp_time; if (mrup) *mrup = NULL; @@ -400,7 +388,7 @@ exit: */ static void xfs_mru_cache_flush( - xfs_mru_cache_t *mru) + struct xfs_mru_cache *mru) { if (!mru || !mru->lists) return; @@ -420,7 +408,7 @@ xfs_mru_cache_flush( void xfs_mru_cache_destroy( - xfs_mru_cache_t *mru) + struct xfs_mru_cache *mru) { if (!mru || !mru->lists) return; @@ -438,38 +426,30 @@ xfs_mru_cache_destroy( */ int xfs_mru_cache_insert( - xfs_mru_cache_t *mru, - unsigned long key, - void *value) + struct xfs_mru_cache *mru, + unsigned long key, + struct xfs_mru_cache_elem *elem) { - xfs_mru_cache_elem_t *elem; + int error; ASSERT(mru && mru->lists); if (!mru || !mru->lists) return EINVAL; - elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP); - if (!elem) + if (radix_tree_preload(GFP_KERNEL)) return ENOMEM; - if (radix_tree_preload(GFP_KERNEL)) { - kmem_zone_free(xfs_mru_elem_zone, elem); - return ENOMEM; - } - INIT_LIST_HEAD(&elem->list_node); elem->key = key; - elem->value = value; spin_lock(&mru->lock); - - radix_tree_insert(&mru->store, key, elem); + error = -radix_tree_insert(&mru->store, key, elem); radix_tree_preload_end(); - _xfs_mru_cache_list_insert(mru, elem); - + if (!error) + _xfs_mru_cache_list_insert(mru, elem); spin_unlock(&mru->lock); - return 0; + return error; } /* @@ -478,13 +458,12 @@ xfs_mru_cache_insert( * the client data pointer for the removed element is returned, otherwise this * function will return a NULL pointer. */ -void * +struct xfs_mru_cache_elem * xfs_mru_cache_remove( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - xfs_mru_cache_elem_t *elem; - void *value = NULL; + struct xfs_mru_cache_elem *elem; ASSERT(mru && mru->lists); if (!mru || !mru->lists) @@ -492,17 +471,11 @@ xfs_mru_cache_remove( spin_lock(&mru->lock); elem = radix_tree_delete(&mru->store, key); - if (elem) { - value = elem->value; + if (elem) list_del(&elem->list_node); - } - spin_unlock(&mru->lock); - if (elem) - kmem_zone_free(xfs_mru_elem_zone, elem); - - return value; + return elem; } /* @@ -511,13 +484,14 @@ xfs_mru_cache_remove( */ void xfs_mru_cache_delete( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - void *value = xfs_mru_cache_remove(mru, key); + struct xfs_mru_cache_elem *elem; - if (value) - mru->free_func(key, value); + elem = xfs_mru_cache_remove(mru, key); + if (elem) + mru->free_func(elem); } /* @@ -540,12 +514,12 @@ xfs_mru_cache_delete( * status, we need to help it get it right by annotating the path that does * not release the lock. */ -void * +struct xfs_mru_cache_elem * xfs_mru_cache_lookup( - xfs_mru_cache_t *mru, - unsigned long key) + struct xfs_mru_cache *mru, + unsigned long key) { - xfs_mru_cache_elem_t *elem; + struct xfs_mru_cache_elem *elem; ASSERT(mru && mru->lists); if (!mru || !mru->lists) @@ -560,7 +534,7 @@ xfs_mru_cache_lookup( } else spin_unlock(&mru->lock); - return elem ? elem->value : NULL; + return elem; } /* @@ -570,7 +544,8 @@ xfs_mru_cache_lookup( */ void xfs_mru_cache_done( - xfs_mru_cache_t *mru) __releases(mru->lock) + struct xfs_mru_cache *mru) + __releases(mru->lock) { spin_unlock(&mru->lock); } diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index 36dd3ec8b4eb..fb5245ba5ff7 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -18,24 +18,15 @@ #ifndef __XFS_MRU_CACHE_H__ #define __XFS_MRU_CACHE_H__ +struct xfs_mru_cache; -/* Function pointer type for callback to free a client's data pointer. */ -typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*); +struct xfs_mru_cache_elem { + struct list_head list_node; + unsigned long key; +}; -typedef struct xfs_mru_cache -{ - struct radix_tree_root store; /* Core storage data structure. */ - struct list_head *lists; /* Array of lists, one per grp. */ - struct list_head reap_list; /* Elements overdue for reaping. */ - spinlock_t lock; /* Lock to protect this struct. */ - unsigned int grp_count; /* Number of discrete groups. */ - unsigned int grp_time; /* Time period spanned by grps. */ - unsigned int lru_grp; /* Group containing time zero. */ - unsigned long time_zero; /* Time first element was added. */ - xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ - struct delayed_work work; /* Workqueue data for reaping. */ - unsigned int queued; /* work has been queued */ -} xfs_mru_cache_t; +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); int xfs_mru_cache_init(void); void xfs_mru_cache_uninit(void); @@ -44,10 +35,12 @@ int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, xfs_mru_cache_free_func_t free_func); void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, - void *value); -void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); + struct xfs_mru_cache_elem *elem); +struct xfs_mru_cache_elem * +xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); -void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_done(struct xfs_mru_cache *mru); #endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 348e4d2ed6e6..6d26759c779a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -193,47 +193,6 @@ xfs_qm_dqpurge( } /* - * Release the group or project dquot pointers the user dquots maybe carrying - * around as a hint, and proceed to purge the user dquot cache if requested. -*/ -STATIC int -xfs_qm_dqpurge_hints( - struct xfs_dquot *dqp, - void *data) -{ - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; - uint flags = *((uint *)data); - - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { - xfs_dqunlock(dqp); - return EAGAIN; - } - - /* If this quota has a hint attached, prepare for releasing it now */ - gdqp = dqp->q_gdquot; - if (gdqp) - dqp->q_gdquot = NULL; - - pdqp = dqp->q_pdquot; - if (pdqp) - dqp->q_pdquot = NULL; - - xfs_dqunlock(dqp); - - if (gdqp) - xfs_qm_dqrele(gdqp); - if (pdqp) - xfs_qm_dqrele(pdqp); - - if (flags & XFS_QMOPT_UQUOTA) - return xfs_qm_dqpurge(dqp, NULL); - - return 0; -} - -/* * Purge the dquot cache. */ void @@ -241,18 +200,8 @@ xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) { - /* - * We have to release group/project dquot hint(s) from the user dquot - * at first if they are there, otherwise we would run into an infinite - * loop while walking through radix tree to purge other type of dquots - * since their refcount is not zero if the user dquot refers to them - * as hint. - * - * Call the special xfs_qm_dqpurge_hints() will end up go through the - * general xfs_qm_dqpurge() against user dquot cache if requested. - */ - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags); - + if (flags & XFS_QMOPT_UQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) @@ -409,7 +358,6 @@ xfs_qm_dqattach_one( xfs_dqid_t id, uint type, uint doalloc, - xfs_dquot_t *udqhint, /* hint */ xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; @@ -419,9 +367,9 @@ xfs_qm_dqattach_one( error = 0; /* - * See if we already have it in the inode itself. IO_idqpp is - * &i_udquot or &i_gdquot. This made the code look weird, but - * made the logic a lot simpler. + * See if we already have it in the inode itself. IO_idqpp is &i_udquot + * or &i_gdquot. This made the code look weird, but made the logic a lot + * simpler. */ dqp = *IO_idqpp; if (dqp) { @@ -430,49 +378,10 @@ xfs_qm_dqattach_one( } /* - * udqhint is the i_udquot field in inode, and is non-NULL only - * when the type arg is group/project. Its purpose is to save a - * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside - * the user dquot. - */ - if (udqhint) { - ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - xfs_dqlock(udqhint); - - /* - * No need to take dqlock to look at the id. - * - * The ID can't change until it gets reclaimed, and it won't - * be reclaimed as long as we have a ref from inode and we - * hold the ilock. - */ - if (type == XFS_DQ_GROUP) - dqp = udqhint->q_gdquot; - else - dqp = udqhint->q_pdquot; - if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { - ASSERT(*IO_idqpp == NULL); - - *IO_idqpp = xfs_qm_dqhold(dqp); - xfs_dqunlock(udqhint); - return 0; - } - - /* - * We can't hold a dquot lock when we call the dqget code. - * We'll deadlock in no time, because of (not conforming to) - * lock ordering - the inodelock comes before any dquot lock, - * and we may drop and reacquire the ilock in xfs_qm_dqget(). - */ - xfs_dqunlock(udqhint); - } - - /* - * Find the dquot from somewhere. This bumps the - * reference count of dquot and returns it locked. - * This can return ENOENT if dquot didn't exist on - * disk and we didn't ask it to allocate; - * ESRCH if quotas got turned off suddenly. + * Find the dquot from somewhere. This bumps the reference count of + * dquot and returns it locked. This can return ENOENT if dquot didn't + * exist on disk and we didn't ask it to allocate; ESRCH if quotas got + * turned off suddenly. */ error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc | XFS_QMOPT_DOWARN, &dqp); @@ -490,48 +399,6 @@ xfs_qm_dqattach_one( return 0; } - -/* - * Given a udquot and group/project type, attach the group/project - * dquot pointer to the udquot as a hint for future lookups. - */ -STATIC void -xfs_qm_dqattach_hint( - struct xfs_inode *ip, - int type) -{ - struct xfs_dquot **dqhintp; - struct xfs_dquot *dqp; - struct xfs_dquot *udq = ip->i_udquot; - - ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - - xfs_dqlock(udq); - - if (type == XFS_DQ_GROUP) { - dqp = ip->i_gdquot; - dqhintp = &udq->q_gdquot; - } else { - dqp = ip->i_pdquot; - dqhintp = &udq->q_pdquot; - } - - if (*dqhintp) { - struct xfs_dquot *tmp; - - if (*dqhintp == dqp) - goto done; - - tmp = *dqhintp; - *dqhintp = NULL; - xfs_qm_dqrele(tmp); - } - - *dqhintp = xfs_qm_dqhold(dqp); -done: - xfs_dqunlock(udq); -} - static bool xfs_qm_need_dqattach( struct xfs_inode *ip) @@ -562,7 +429,6 @@ xfs_qm_dqattach_locked( uint flags) { xfs_mount_t *mp = ip->i_mount; - uint nquotas = 0; int error = 0; if (!xfs_qm_need_dqattach(ip)) @@ -570,77 +436,39 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_UQUOTA_ON(mp)) { + if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, flags & XFS_QMOPT_DQALLOC, - NULL, &ip->i_udquot); + &ip->i_udquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_udquot); } - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_GQUOTA_ON(mp)) { + if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, flags & XFS_QMOPT_DQALLOC, - ip->i_udquot, &ip->i_gdquot); - /* - * Don't worry about the udquot that we may have - * attached above. It'll get detached, if not already. - */ + &ip->i_gdquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_gdquot); } - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_PQUOTA_ON(mp)) { + if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, flags & XFS_QMOPT_DQALLOC, - ip->i_udquot, &ip->i_pdquot); - /* - * Don't worry about the udquot that we may have - * attached above. It'll get detached, if not already. - */ + &ip->i_pdquot); if (error) goto done; - nquotas++; + ASSERT(ip->i_pdquot); } +done: /* - * Attach this group/project quota to the user quota as a hint. - * This WON'T, in general, result in a thrash. + * Don't worry about the dquots that we may have attached before any + * error - they'll get detached later if it has not already been done. */ - if (nquotas > 1 && ip->i_udquot) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp)); - ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp)); - - /* - * We do not have i_udquot locked at this point, but this check - * is OK since we don't depend on the i_gdquot to be accurate - * 100% all the time. It is just a hint, and this will - * succeed in general. - */ - if (ip->i_udquot->q_gdquot != ip->i_gdquot) - xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP); - - if (ip->i_udquot->q_pdquot != ip->i_pdquot) - xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ); - } - - done: -#ifdef DEBUG - if (!error) { - if (XFS_IS_UQUOTA_ON(mp)) - ASSERT(ip->i_udquot); - if (XFS_IS_GQUOTA_ON(mp)) - ASSERT(ip->i_gdquot); - if (XFS_IS_PQUOTA_ON(mp)) - ASSERT(ip->i_pdquot); - } ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); -#endif return error; } @@ -843,22 +671,17 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - if ((error = list_lru_init(&qinf->qi_lru))) { - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - if ((error = xfs_qm_init_quotainos(mp))) { - list_lru_destroy(&qinf->qi_lru); - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); @@ -870,8 +693,7 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(mp, - qinf->qi_dqchunklen); + qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); @@ -918,7 +740,7 @@ xfs_qm_init_quotainfo( qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - + xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -935,6 +757,13 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 3daf5ea1eb8d..bbc813caba4c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -278,9 +278,10 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error; + int error = EINVAL; - if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + (flags & ~XFS_DQ_ALLTYPES)) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); return XFS_ERROR(EINVAL); @@ -959,7 +960,6 @@ xfs_qm_export_flags( STATIC int xfs_dqrele_inode( struct xfs_inode *ip, - struct xfs_perag *pag, int flags, void *args) { diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h index b3b2b1065c0f..137e20937077 100644 --- a/fs/xfs/xfs_quota_defs.h +++ b/fs/xfs/xfs_quota_defs.h @@ -156,6 +156,6 @@ typedef __uint16_t xfs_qwarncnt_t; extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, xfs_dqid_t id, uint type, uint flags, char *str); -extern int xfs_calc_dquots_per_chunk(struct xfs_mount *mp, unsigned int nbblks); +extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index af33cafe69b6..2ad1b9822e92 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -100,16 +100,36 @@ xfs_fs_set_xstate( if (!XFS_IS_QUOTA_ON(mp)) return -EINVAL; return -xfs_qm_scall_quotaoff(mp, flags); - case Q_XQUOTARM: - if (XFS_IS_QUOTA_ON(mp)) - return -EINVAL; - return -xfs_qm_scall_trunc_qfiles(mp, flags); } return -EINVAL; } STATIC int +xfs_fs_rm_xquota( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_USER; + if (uflags & FS_GROUP_QUOTA) + flags |= XFS_DQ_GROUP; + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_PROJ; + + return -xfs_qm_scall_trunc_qfiles(mp, flags); +} + +STATIC int xfs_fs_get_dqblk( struct super_block *sb, struct kqid qid, @@ -149,6 +169,7 @@ const struct quotactl_ops xfs_quotactl_operations = { .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, + .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c index b1f2fe8af4a8..f4dd697cac08 100644 --- a/fs/xfs/xfs_rtbitmap.c +++ b/fs/xfs/xfs_rtbitmap.c @@ -74,7 +74,6 @@ xfs_rtbuf_get( mp->m_bsize, 0, &bp, NULL); if (error) return error; - ASSERT(!xfs_buf_geterror(bp)); *bpp = bp; return 0; } diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index 0c0e41bbe4e3..7703fa6770ff 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -201,10 +201,6 @@ xfs_mount_validate_sb( * write validation, we don't need to check feature masks. */ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, @@ -295,7 +291,8 @@ xfs_mount_validate_sb( (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || - sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || + sbp->sb_shared_vn != 0)) { xfs_notice(mp, "SB sanity check failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -337,15 +334,6 @@ xfs_mount_validate_sb( xfs_warn(mp, "Offline file system operation in progress!"); return XFS_ERROR(EFSCORRUPTED); } - - /* - * Version 1 directory format has never worked on Linux. - */ - if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { - xfs_warn(mp, "file system using version 1 directory format"); - return XFS_ERROR(ENOSYS); - } - return 0; } @@ -495,10 +483,16 @@ xfs_sb_quota_to_disk( } /* - * GQUOTINO and PQUOTINO cannot be used together in versions - * of superblock that do not have pquotino. from->sb_flags - * tells us which quota is active and should be copied to - * disk. + * GQUOTINO and PQUOTINO cannot be used together in versions of + * superblock that do not have pquotino. from->sb_flags tells us which + * quota is active and should be copied to disk. If neither are active, + * make sure we write NULLFSINO to the sb_gquotino field as a quota + * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature + * bit is set. + * + * Note that we don't need to handle the sb_uquotino or sb_pquotino here + * as they do not require any translation. Hence the main sb field loop + * will write them appropriately from the in-core superblock. */ if ((*fields & XFS_SB_GQUOTINO) && (from->sb_qflags & XFS_GQUOTA_ACCT)) @@ -506,6 +500,17 @@ xfs_sb_quota_to_disk( else if ((*fields & XFS_SB_PQUOTINO) && (from->sb_qflags & XFS_PQUOTA_ACCT)) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. + */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); } diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f7b2fe77c5a5..c43c2d609a24 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -36,8 +36,6 @@ struct xfs_trans; #define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ #define XFS_SB_VERSION_NUMBITS 0x000f #define XFS_SB_VERSION_ALLFBITS 0xfff0 -#define XFS_SB_VERSION_SASHFBITS 0xf000 -#define XFS_SB_VERSION_REALFBITS 0x0ff0 #define XFS_SB_VERSION_ATTRBIT 0x0010 #define XFS_SB_VERSION_NLINKBIT 0x0020 #define XFS_SB_VERSION_QUOTABIT 0x0040 @@ -50,24 +48,15 @@ struct xfs_trans; #define XFS_SB_VERSION_DIRV2BIT 0x2000 #define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 -#define XFS_SB_VERSION_OKSASHFBITS \ - (XFS_SB_VERSION_EXTFLGBIT | \ - XFS_SB_VERSION_DIRV2BIT | \ - XFS_SB_VERSION_BORGBIT) -#define XFS_SB_VERSION_OKREALFBITS \ - (XFS_SB_VERSION_ATTRBIT | \ - XFS_SB_VERSION_NLINKBIT | \ - XFS_SB_VERSION_QUOTABIT | \ - XFS_SB_VERSION_ALIGNBIT | \ - XFS_SB_VERSION_DALIGNBIT | \ - XFS_SB_VERSION_SHAREDBIT | \ - XFS_SB_VERSION_LOGV2BIT | \ - XFS_SB_VERSION_SECTORBIT | \ - XFS_SB_VERSION_MOREBITSBIT) -#define XFS_SB_VERSION_OKREALBITS \ - (XFS_SB_VERSION_NUMBITS | \ - XFS_SB_VERSION_OKREALFBITS | \ - XFS_SB_VERSION_OKSASHFBITS) + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) /* * There are two words to hold XFS "feature" bits: the original @@ -76,7 +65,6 @@ struct xfs_trans; * * These defines represent bits in sb_features2. */ -#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ #define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 #define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 @@ -86,16 +74,11 @@ struct xfs_trans; #define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ #define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ -#define XFS_SB_VERSION2_OKREALFBITS \ +#define XFS_SB_VERSION2_OKBITS \ (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_FTYPE) -#define XFS_SB_VERSION2_OKSASHFBITS \ - (0) -#define XFS_SB_VERSION2_OKREALBITS \ - (XFS_SB_VERSION2_OKREALFBITS | \ - XFS_SB_VERSION2_OKSASHFBITS ) /* * Superblock - in core version. Must match the ondisk version below. @@ -345,214 +328,140 @@ typedef enum { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -static inline int xfs_sb_good_version(xfs_sb_t *sbp) -{ - /* We always support version 1-3 */ - if (sbp->sb_versionnum >= XFS_SB_VERSION_1 && - sbp->sb_versionnum <= XFS_SB_VERSION_3) - return 1; - - /* We support version 4 if all feature bits are supported */ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) { - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) - return 0; - - if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) - return 0; - return 1; - } - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) - return 1; - - return 0; -} - /* - * Detect a mismatched features2 field. Older kernels read/wrote - * this into the wrong slot, so to be safe we keep them in sync. + * The first XFS version we support is a v4 superblock with V2 directories. */ -static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp) +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) { - return (sbp->sb_bad_features2 != sbp->sb_features2); -} - -static inline unsigned xfs_sb_version_tonew(unsigned v) -{ - if (v == XFS_SB_VERSION_1) - return XFS_SB_VERSION_4; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; - if (v == XFS_SB_VERSION_2) - return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; - return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT | - XFS_SB_VERSION_NLINKBIT; + return true; } -static inline unsigned xfs_sb_version_toold(unsigned v) +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) { - if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) - return 0; - if (v & XFS_SB_VERSION_NLINKBIT) - return XFS_SB_VERSION_3; - if (v & XFS_SB_VERSION_ATTRBIT) - return XFS_SB_VERSION_2; - return XFS_SB_VERSION_1; -} - -static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) -{ - return sbp->sb_versionnum == XFS_SB_VERSION_2 || - sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; } -static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) { - if (sbp->sb_versionnum == XFS_SB_VERSION_1) - sbp->sb_versionnum = XFS_SB_VERSION_2; - else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) - sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; - else - sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT; + return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) { - return sbp->sb_versionnum == XFS_SB_VERSION_3 || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); } -static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) { - if (sbp->sb_versionnum <= XFS_SB_VERSION_2) - sbp->sb_versionnum = XFS_SB_VERSION_3; - else - sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } -static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } -static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4) - sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; - else - sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) | - XFS_SB_VERSION_QUOTABIT; + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } -static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); } -static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); -} - -static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } -static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); } -static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)); + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } -static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } -static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); -} - -static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 && - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } /* * sb_features2 bit version macros. - * - * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro: - * - * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp) - * ((xfs_sb_version_hasmorebits(sbp) && - * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) */ - -static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || (xfs_sb_version_hasmorebits(sbp) && (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); } -static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || (xfs_sb_version_hasmorebits(sbp) && (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); } -static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT; } -static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) { sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; if (!sbp->sb_features2) sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; } -static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) { return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || (xfs_sb_version_hasmorebits(sbp) && (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } -static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp) +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; @@ -587,7 +496,9 @@ xfs_sb_has_compat_feature( return (sbp->sb_features_compat & feature) != 0; } -#define XFS_SB_FEAT_RO_COMPAT_ALL 0 +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -623,12 +534,12 @@ xfs_sb_has_incompat_log_feature( /* * V5 superblock specific feature checks */ -static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) +static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } -static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp) +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } @@ -641,6 +552,12 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); +} + /* * end of superblock version macros */ diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h index 4484e5151395..82404da2ca67 100644 --- a/fs/xfs/xfs_shared.h +++ b/fs/xfs/xfs_shared.h @@ -238,7 +238,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); -bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index ce372b7d5644..f2240383d4bb 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -59,6 +59,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) { "abtc2", XFSSTAT_END_ABTC_V2 }, { "bmbt2", XFSSTAT_END_BMBT_V2 }, { "ibt2", XFSSTAT_END_IBT_V2 }, + { "fibt2", XFSSTAT_END_FIBT_V2 }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index c03ad38ceaeb..c8f238b8299a 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -183,7 +183,23 @@ struct xfsstats { __uint32_t xs_ibt_2_alloc; __uint32_t xs_ibt_2_free; __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6) +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) + __uint32_t xs_fibt_2_lookup; + __uint32_t xs_fibt_2_compare; + __uint32_t xs_fibt_2_insrec; + __uint32_t xs_fibt_2_delrec; + __uint32_t xs_fibt_2_newroot; + __uint32_t xs_fibt_2_killroot; + __uint32_t xs_fibt_2_increment; + __uint32_t xs_fibt_2_decrement; + __uint32_t xs_fibt_2_lshift; + __uint32_t xs_fibt_2_rshift; + __uint32_t xs_fibt_2_split; + __uint32_t xs_fibt_2_join; + __uint32_t xs_fibt_2_alloc; + __uint32_t xs_fibt_2_free; + __uint32_t xs_fibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 205376776377..8f0333b3f7a0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -765,20 +765,18 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, - mp->m_fsname); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, - mp->m_fsname); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -811,8 +809,7 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -822,14 +819,12 @@ xfs_setup_devices( if (xfs_sb_version_hassector(&mp->m_sb)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_setsize_buftarg(mp->m_logdev_targp, - mp->m_sb.sb_blocksize, log_sector_size); if (error) return error; } if (mp->m_rtdev_targp) { error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_blocksize, mp->m_sb.sb_sectsize); if (error) return error; @@ -1433,11 +1428,11 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_init_mount_workqueues(mp); + error = -xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = -xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; @@ -1754,13 +1749,9 @@ init_xfs_fs(void) if (error) goto out_destroy_wq; - error = xfs_filestream_init(); - if (error) - goto out_mru_cache_uninit; - error = xfs_buf_init(); if (error) - goto out_filestream_uninit; + goto out_mru_cache_uninit; error = xfs_init_procfs(); if (error) @@ -1787,8 +1778,6 @@ init_xfs_fs(void) xfs_cleanup_procfs(); out_buf_terminate: xfs_buf_terminate(); - out_filestream_uninit: - xfs_filestream_uninit(); out_mru_cache_uninit: xfs_mru_cache_uninit(); out_destroy_wq: @@ -1807,7 +1796,6 @@ exit_xfs_fs(void) xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); - xfs_filestream_uninit(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_zones(); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 52979aa90986..d69363c833e1 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -27,6 +27,7 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_inode.h" #include "xfs_ialloc.h" @@ -92,7 +93,7 @@ xfs_readlink_bmap( cur_chunk = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset, + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { error = EFSCORRUPTED; xfs_alert(mp, diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c index 9b32052ff65e..23c2f2577c8d 100644 --- a/fs/xfs/xfs_symlink_remote.c +++ b/fs/xfs/xfs_symlink_remote.c @@ -80,7 +80,6 @@ xfs_symlink_hdr_set( */ bool xfs_symlink_hdr_ok( - struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, uint32_t size, diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index dee3279c095e..1e85bcd0e418 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -46,6 +46,7 @@ #include "xfs_log_recover.h" #include "xfs_inode_item.h" #include "xfs_bmap_btree.h" +#include "xfs_filestream.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4ae41c179a8..152f82782630 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -538,6 +538,64 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); +DECLARE_EVENT_CLASS(xfs_filestream_class, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), + TP_ARGS(ip, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams) +) +#define DEFINE_FILESTREAM_EVENT(name) \ +DEFINE_EVENT(xfs_filestream_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ + TP_ARGS(ip, agno)) +DEFINE_FILESTREAM_EVENT(xfs_filestream_free); +DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); + +TRACE_EVENT(xfs_filestream_pick, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, + xfs_extlen_t free, int nscan), + TP_ARGS(ip, agno, free, nscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + __field(xfs_extlen_t, free) + __field(int, nscan) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->free = free; + __entry->nscan = nscan; + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams, + __entry->free, + __entry->nscan) +); + DECLARE_EVENT_CLASS(xfs_lock_class, TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, unsigned long caller_ip), @@ -603,6 +661,7 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL @@ -1059,7 +1118,6 @@ DEFINE_RW_EVENT(xfs_file_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_splice_read); -DEFINE_RW_EVENT(xfs_file_splice_write); DECLARE_EVENT_CLASS(xfs_page_class, TP_PROTO(struct inode *inode, struct page *page, unsigned long off, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 54a57326d85b..d03932564ccb 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -827,7 +827,7 @@ xfs_trans_committed_bulk( xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); spin_lock(&ailp->xa_lock); - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index a7287354e535..cb0f3a84cc68 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -173,7 +173,6 @@ xfs_trans_ail_cursor_next( */ void xfs_trans_ail_cursor_done( - struct xfs_ail *ailp, struct xfs_ail_cursor *cur) { cur->item = NULL; @@ -368,7 +367,7 @@ xfsaild_push( * If the AIL is empty or our push has reached the end we are * done now. */ - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); goto out_done; } @@ -453,7 +452,7 @@ xfsaild_push( break; lsn = lip->li_lsn; } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 12e86af9d9b9..bd1281862ad7 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -133,8 +133,7 @@ struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, xfs_lsn_t lsn); struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); -void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, - struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); #if BITS_PER_LONG != 64 static inline void diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index ae368165244d..f2bda7c76b8a 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_bmap_btree.h" #include "xfs_ialloc.h" @@ -106,6 +107,47 @@ xfs_calc_inode_res( } /* + * The free inode btree is a conditional feature and the log reservation + * requirements differ slightly from that of the traditional inode allocation + * btree. The finobt tracks records for inode chunks with at least one free + * inode. A record can be removed from the tree for an inode allocation + * or free and thus the finobt reservation is unconditional across: + * + * - inode allocation + * - inode free + * - inode chunk allocation + * + * The 'modify' param indicates to include the record modification scenario. The + * 'alloc' param indicates to include the reservation for free space btree + * modifications on behalf of finobt modifications. This is required only for + * transactions that do not already account for free space btree modifications. + * + * the free inode btree: max depth * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the free inode btree entry: block size + */ +STATIC uint +xfs_calc_finobt_res( + struct xfs_mount *mp, + int alloc, + int modify) +{ + uint res; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); + if (alloc) + res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (modify) + res += (uint)XFS_FSB_TO_B(mp, 1); + + return res; +} + +/* * Various log reservation values. * * These are based on the size of the file system block because that is what @@ -302,6 +344,7 @@ xfs_calc_remove_reservation( * the superblock for the nlink flag: sector size * the directory btree: (max depth + v2) * dir block size * the directory inode's bmap btree: (max depth + v2) * block size + * the finobt (record modification and allocation btrees) */ STATIC uint xfs_calc_create_resv_modify( @@ -310,7 +353,8 @@ xfs_calc_create_resv_modify( return xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + (uint)XFS_FSB_TO_B(mp, 1) + - xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 1, 1); } /* @@ -348,6 +392,7 @@ __xfs_calc_create_reservation( * the superblock for the nlink flag: sector size * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion) */ STATIC uint xfs_calc_icreate_resv_alloc( @@ -357,7 +402,8 @@ xfs_calc_icreate_resv_alloc( mp->m_sb.sb_sectsize + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 0); } STATIC uint @@ -425,6 +471,7 @@ xfs_calc_symlink_reservation( * the on disk inode before ours in the agi hash list: inode cluster size * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion, removal or modification) */ STATIC uint xfs_calc_ifree_reservation( @@ -439,7 +486,8 @@ xfs_calc_ifree_reservation( xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), - XFS_FSB_TO_B(mp, 1)); + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 1); } /* @@ -562,7 +610,7 @@ xfs_calc_addafork_reservation( return XFS_DQUOT_LOGRES(mp) + xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(1, mp->m_dirblksize) + + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index af5dbe06cb65..bf9c4579334d 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -28,7 +28,8 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) -#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1) +#define XFS_DAENTER_1B(mp,w) \ + ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) #define XFS_DAENTER_BLOCKS(mp,w) \ @@ -47,13 +48,15 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - ((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1) + ((mp)->m_ialloc_blks + \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((mp)->m_in_maxlevels - 1))) /* * Space reservation values for various transactions. */ #define XFS_ADDAFORK_SPACE_RES(mp) \ - ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) + ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) #define XFS_ATTRRM_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) /* This macro is not used - see inline code in xfs_attr_set */ @@ -82,5 +85,8 @@ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) +#define XFS_IFREE_SPACE_RES(mp) \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) + #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 82bbc34d54a3..65c6e6650b1a 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -134,7 +134,7 @@ typedef enum { typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, - XFS_BTNUM_MAX + XFS_BTNUM_FINOi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { |