diff options
Diffstat (limited to 'fs')
464 files changed, 12718 insertions, 8147 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a16b0ff497ca..d8223209d4b1 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index bb7991c7e5c7..53161ec058a7 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -451,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); - truncate_inode_pages(inode->i_mapping, 0); + truncate_inode_pages_final(inode->i_mapping); clear_inode(inode); filemap_fdatawrite(inode->i_mapping); diff --git a/fs/Kconfig b/fs/Kconfig index 7385e54be4b9..312393f32948 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -96,6 +96,7 @@ endif # BLOCK menu "Pseudo filesystems" source "fs/proc/Kconfig" +source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" config TMPFS diff --git a/fs/Makefile b/fs/Makefile index 47ac07bb4acc..f9cb9876e466 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -52,7 +52,8 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-$(CONFIG_SYSFS) += sysfs/ kernfs/ +obj-$(CONFIG_KERNFS) += kernfs/ +obj-$(CONFIG_SYSFS) += sysfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 7b3003cb6f1b..9852bdf34d76 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return parse_options(sb, data); } @@ -265,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 3952121f2f28..25b23b1e7f22 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -5,14 +5,6 @@ #include <linux/mutex.h> #include <linux/workqueue.h> -/* AmigaOS allows file names with up to 30 characters length. - * Names longer than that will be silently truncated. If you - * want to disallow this, comment out the following #define. - * Creating filesystem objects with longer names will then - * result in an error (ENAMETOOLONG). - */ -/*#define AFFS_NO_TRUNCATE */ - /* Ugly macros make the code more pretty. */ #define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st)))) @@ -28,7 +20,6 @@ #define AFFS_CACHE_SIZE PAGE_SIZE -#define AFFS_MAX_PREALLOC 32 #define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2) #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) @@ -118,6 +109,7 @@ struct affs_sb_info { #define SF_OFS 0x0200 /* Old filesystem */ #define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ +#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) @@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); -extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); -extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); -extern int affs_check_name(const unsigned char *name, int len); +extern void affs_error(struct super_block *sb, const char *function, + const char *fmt, ...); +extern void affs_warning(struct super_block *sb, const char *function, + const char *fmt, ...); +extern bool affs_nofilenametruncate(const struct dentry *dentry); +extern int affs_check_name(const unsigned char *name, int len, + bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. c */ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index d9a43674cb94..533a322c41c0 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) function,ErrorBuffer); } +bool +affs_nofilenametruncate(const struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE; + +} + /* Check if the name is valid for a affs object. */ int -affs_check_name(const unsigned char *name, int len) +affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; - if (len > 30) -#ifdef AFFS_NO_TRUNCATE - return -ENAMETOOLONG; -#else - len = 30; -#endif - + if (len > 30) { + if (notruncate) + return -ENAMETOOLONG; + else + len = 30; + } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' || (name[i] > 0x7e && name[i] < 0xa0)) diff --git a/fs/affs/dir.c b/fs/affs/dir.c index f1eba8c3644e..cbbda476a805 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx) int hash_pos; int chain_pos; u32 ino; + int error = 0; - pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos); + pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n", + inode->i_ino, (unsigned long)ctx->pos); if (ctx->pos < 2) { file->private_data = (void *)0; @@ -72,7 +74,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) } dir_bh = affs_bread(sb, inode->i_ino); if (!dir_bh) - goto readdir_out; + goto out_unlock_dir; /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. @@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) fh_bh = affs_bread(sb, ino); if (!fh_bh) { affs_error(sb, "readdir","Cannot read block %d", i); - return -EIO; + error = -EIO; + goto out_brelse_dir; } ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); @@ -107,29 +110,34 @@ inside: do { fh_bh = affs_bread(sb, ino); if (!fh_bh) { - affs_error(sb, "readdir","Cannot read block %d", ino); + affs_error(sb, "readdir", + "Cannot read block %d", ino); break; } namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); name = AFFS_TAIL(sb, fh_bh)->name + 1; - pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n", + pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", " + "ino=%u), hash=%d, f_pos=%x\n", namelen, name, ino, hash_pos, (u32)ctx->pos); + if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) - goto readdir_done; + goto done; ctx->pos++; ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain); affs_brelse(fh_bh); fh_bh = NULL; } while (ino); } -readdir_done: +done: file->f_version = inode->i_version; file->private_data = (void *)(long)ino; + affs_brelse(fh_bh); -readdir_out: +out_brelse_dir: affs_brelse(dir_bh); - affs_brelse(fh_bh); + +out_unlock_dir: affs_unlock_dir(inode); - return 0; + return error; } diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0e092d08680e..96df91e8c334 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -259,7 +259,7 @@ affs_evict_inode(struct inode *inode) { unsigned long cache_page; pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index c36cbb4537a2..6dae1ccd176d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(struct qstr *qstr, toupper_t toupper) +__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; int i; - i = affs_check_name(qstr->name, qstr->len); + i = affs_check_name(qstr->name, qstr->len, notruncate); if (i) return i; @@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper) static int affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_toupper); + return __affs_hash_dentry(qstr, affs_toupper, + affs_nofilenametruncate(dentry)); + } + static int affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_intl_toupper); + return __affs_hash_dentry(qstr, affs_intl_toupper, + affs_nofilenametruncate(dentry)); + } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper) + const char *str, const struct qstr *name, toupper_t toupper, + bool notruncate) { const u8 *aname = str; const u8 *bname = name->name; @@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len, * must be valid. 'name' must be validated first. */ - if (affs_check_name(name->name, name->len)) + if (affs_check_name(name->name, name->len, notruncate)) return 1; /* @@ -126,13 +132,18 @@ static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_toupper); + + return __affs_compare_dentry(len, str, name, affs_toupper, + affs_nofilenametruncate(parent)); } + static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - return __affs_compare_dentry(len, str, name, affs_intl_toupper); + return __affs_compare_dentry(len, str, name, affs_intl_toupper, + affs_nofilenametruncate(parent)); + } /* @@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); - retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len); + retval = affs_check_name(new_dentry->d_name.name, + new_dentry->d_name.len, + affs_nofilenametruncate(old_dentry)); + if (retval) return retval; diff --git a/fs/affs/super.c b/fs/affs/super.c index d098731b82ff..895ac7dc9dbf 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -128,7 +128,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), @@ -163,7 +163,7 @@ static const struct super_operations affs_sops = { }; enum { - Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect, + Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; @@ -172,6 +172,7 @@ static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, + {Opt_notruncate, "nofilenametruncate"}, {Opt_prefix, "prefix=%s"}, {Opt_protect, "protect"}, {Opt_reserved, "reserved=%u"}, @@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, case Opt_mufs: *mount_opts |= SF_MUFS; break; + case Opt_notruncate: + *mount_opts |= SF_NO_TRUNCATE; + break; case Opt_prefix: *prefix = match_strdup(&args[0]); if (!*prefix) @@ -336,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ @@ -530,6 +532,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); + sync_filesystem(sb); *flags |= MS_NODIRATIME; memcpy(volume, sbi->s_volume, 32); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 1c8c6cc6de30..4b0eff6da674 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call) { _enter(""); + /* Break the callbacks here so that we do it after the final ACK is + * received. The step number here must match the final number in + * afs_deliver_cb_callback(). + */ + if (call->unmarshall == 6) { + ASSERT(call->server && call->count && call->request); + afs_break_callbacks(call->server, call->count, call->request); + } + afs_put_server(call->server); call->server = NULL; kfree(call->buffer); @@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, _debug("trailer"); if (skb->len != 0) return -EBADMSG; + + /* Record that the message was unmarshalled successfully so + * that the call destructor can know do the callback breaking + * work, even if the final ACK isn't received. + * + * If the step number changes, then afs_cm_destructor() must be + * updated also. + */ + call->unmarshall++; + case 6: break; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index ce25d755b7aa..294671288449 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); afs_give_up_callback(vnode); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 6621f8008122..590b55f46d61 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -75,6 +75,7 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ + void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ struct sk_buff_head rx_queue; /* received packets */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8ad8c2a0703a..03a3beb17004 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *); static int afs_wait_for_call_to_complete(struct afs_call *); static void afs_wake_up_async_call(struct afs_call *); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct work_struct *); +static void afs_process_async_call(struct afs_call *); static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); @@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *); static struct sk_buff_head afs_incoming_calls; static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); +static void afs_async_workfn(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, async_work); + + call->async_workfn(call); +} + /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT @@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call) } /* + * End a call but do not free it + */ +static void afs_end_call_nofree(struct afs_call *call) +{ + if (call->rxcall) { + rxrpc_kernel_end_call(call->rxcall); + call->rxcall = NULL; + } + if (call->type->destructor) + call->type->destructor(call); +} + +/* + * End a call and free it + */ +static void afs_end_call(struct afs_call *call) +{ + afs_end_call_nofree(call); + afs_free_call(call); +} + +/* * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, @@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - INIT_WORK(&call->async_work, afs_process_async_call); + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -383,11 +413,8 @@ error_do_abort: rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - rxrpc_kernel_end_call(rxcall); - call->rxcall = NULL; error_kill_call: - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state >= AFS_CALL_COMPLETE) { while ((skb = skb_dequeue(&call->rx_queue))) afs_free_skb(skb); - if (call->incoming) { - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); - } + if (call->incoming) + afs_end_call(call); } _leave(""); @@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) } _debug("call complete"); - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" = %d", ret); return ret; } @@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct work_struct *work) +static void afs_delete_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); afs_free_call(call); @@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work) * - on a multiple-thread workqueue this work item may try to run on several * CPUs at the same time */ -static void afs_process_async_call(struct work_struct *work) +static void afs_process_async_call(struct afs_call *call) { - struct afs_call *call = - container_of(work, struct afs_call, async_work); - _enter(""); if (!skb_queue_empty(&call->rx_queue)) @@ -637,14 +651,11 @@ static void afs_process_async_call(struct work_struct *work) call->reply = NULL; /* kill the call */ - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - if (call->type->destructor) - call->type->destructor(call); + afs_end_call_nofree(call); /* we can't just delete the call because the work item may be * queued */ - PREPARE_WORK(&call->async_work, afs_delete_async_call); + call->async_workfn = afs_delete_async_call; queue_work(afs_async_calls, &call->async_work); } @@ -685,7 +696,8 @@ static void afs_collect_incoming_call(struct work_struct *work) return; } - INIT_WORK(&call->async_work, afs_process_async_call); + call->async_workfn = afs_process_async_call; + INIT_WORK(&call->async_work, afs_async_workfn); call->wait_mode = &afs_async_incoming_call; call->type = &afs_RXCMxxxx; init_waitqueue_head(&call->waitq); @@ -782,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call) _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); default: - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); return; } @@ -815,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) call->state = AFS_CALL_AWAIT_ACK; n = rxrpc_kernel_send_data(call->rxcall, &msg, len); if (n >= 0) { + /* Success */ _leave(" [replied]"); return; } + if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); } - rxrpc_kernel_end_call(call->rxcall); - call->rxcall = NULL; - call->type->destructor(call); - afs_free_call(call); + afs_end_call(call); _leave(" [error]"); } @@ -52,7 +52,8 @@ struct aio_ring { unsigned id; /* kernel internal index number */ unsigned nr; /* number of io_events */ - unsigned head; + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). */ unsigned tail; unsigned magic; @@ -111,6 +112,11 @@ struct kioctx { struct work_struct free_work; + /* + * signals when all in-flight requests are done + */ + struct completion *requests_done; + struct { /* * This counts the number of available slots in the ringbuffer, @@ -243,6 +249,11 @@ static void aio_free_ring(struct kioctx *ctx) { int i; + /* Disconnect the kiotx from the ring file. This prevents future + * accesses to the kioctx from page migration. + */ + put_aio_ring_file(ctx); + for (i = 0; i < ctx->nr_pages; i++) { struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, @@ -254,8 +265,6 @@ static void aio_free_ring(struct kioctx *ctx) put_page(page); } - put_aio_ring_file(ctx); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); ctx->ring_pages = NULL; @@ -283,29 +292,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, { struct kioctx *ctx; unsigned long flags; + pgoff_t idx; int rc; rc = 0; - /* Make sure the old page hasn't already been changed */ + /* mapping->private_lock here protects against the kioctx teardown. */ spin_lock(&mapping->private_lock); ctx = mapping->private_data; - if (ctx) { - pgoff_t idx; - spin_lock_irqsave(&ctx->completion_lock, flags); - idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) { - if (ctx->ring_pages[idx] != old) - rc = -EAGAIN; - } else - rc = -EINVAL; - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (!ctx) { + rc = -EINVAL; + goto out; + } + + /* The ring_lock mutex. The prevents aio_read_events() from writing + * to the ring's head, and prevents page migration from mucking in + * a partially initialized kiotx. + */ + if (!mutex_trylock(&ctx->ring_lock)) { + rc = -EAGAIN; + goto out; + } + + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + /* Make sure the old page hasn't already been changed */ + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; } else rc = -EINVAL; - spin_unlock(&mapping->private_lock); if (rc != 0) - return rc; + goto out_unlock; /* Writeback must be complete */ BUG_ON(PageWriteback(old)); @@ -314,38 +332,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); - return rc; + goto out_unlock; } - /* We can potentially race against kioctx teardown here. Use the - * address_space's private data lock to protect the mapping's - * private_data. + /* Take completion_lock to prevent other writes to the ring buffer + * while the old page is copied to the new. This prevents new + * events from being lost. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; - if (ctx) { - pgoff_t idx; - spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) { - /* And only do the move if things haven't changed */ - if (ctx->ring_pages[idx] == old) - ctx->ring_pages[idx] = new; - else - rc = -EAGAIN; - } else - rc = -EINVAL; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } else - rc = -EBUSY; - spin_unlock(&mapping->private_lock); + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + BUG_ON(ctx->ring_pages[idx] != old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (rc == MIGRATEPAGE_SUCCESS) - put_page(old); - else - put_page(new); + /* The old page is no longer accessible. */ + put_page(old); +out_unlock: + mutex_unlock(&ctx->ring_lock); +out: + spin_unlock(&mapping->private_lock); return rc; } #endif @@ -380,7 +386,7 @@ static int aio_setup_ring(struct kioctx *ctx) file = aio_private_file(ctx, nr_pages); if (IS_ERR(file)) { ctx->aio_ring_file = NULL; - return -EAGAIN; + return -ENOMEM; } ctx->aio_ring_file = file; @@ -415,7 +421,7 @@ static int aio_setup_ring(struct kioctx *ctx) if (unlikely(i != nr_pages)) { aio_free_ring(ctx); - return -EAGAIN; + return -ENOMEM; } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -429,7 +435,7 @@ static int aio_setup_ring(struct kioctx *ctx) if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; aio_free_ring(ctx); - return -EAGAIN; + return -ENOMEM; } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); @@ -507,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + /* At this point we know that there are no any in-flight requests */ + if (ctx->requests_done) + complete(ctx->requests_done); + INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); } @@ -556,6 +566,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); + /* While kioctx setup is in progress, + * we are protected from page migration + * changes ring_pages by ->ring_lock. + */ ring = kmap_atomic(ctx->ring_pages[0]); ring->id = ctx->id; kunmap_atomic(ring); @@ -640,24 +654,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - if (percpu_ref_init(&ctx->users, free_ioctx_users)) - goto err; - - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) - goto err; - spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); + /* Protect against page migration throughout kiotx setup by keeping + * the ring_lock mutex held until setup is complete. */ + mutex_lock(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); + if (percpu_ref_init(&ctx->users, free_ioctx_users)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + goto err; + ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) goto err; - if (aio_setup_ring(ctx) < 0) + err = aio_setup_ring(ctx); + if (err < 0) goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); @@ -683,6 +701,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (err) goto err_cleanup; + /* Release the ring_lock mutex now that all setup is complete. */ + mutex_unlock(&ctx->ring_lock); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; @@ -692,6 +713,7 @@ err_cleanup: err_ctx: aio_free_ring(ctx); err: + mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); @@ -705,7 +727,8 @@ err: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct completion *requests_done) { if (!atomic_xchg(&ctx->dead, 1)) { struct kioctx_table *table; @@ -734,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); + ctx->requests_done = requests_done; percpu_ref_kill(&ctx->users); + } else { + if (requests_done) + complete(requests_done); } } @@ -796,7 +823,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx); + kill_ioctx(mm, ctx, NULL); } } @@ -1024,6 +1051,7 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); + /* Access to ->ring_pages here is protected by ctx->ring_lock. */ ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; @@ -1171,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(current->mm, ioctx); + kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } @@ -1189,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(current->mm, ioctx); + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + kill_ioctx(current->mm, ioctx, &requests_done); percpu_ref_put(&ioctx->users); + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + wait_for_completion(&requests_done); + return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); @@ -1285,10 +1327,8 @@ rw_common: &iovec, compat) : aio_setup_single_vector(req, rw, buf, &nr_segs, iovec); - if (ret) - return ret; - - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { if (iovec != &inline_vec) kfree(iovec); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 3182c0e68b42..232e03d4780d 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i if (tmp.size < sizeof(tmp)) return ERR_PTR(-EINVAL); + if (tmp.size > (PATH_MAX + sizeof(tmp))) + return ERR_PTR(-ENAMETOOLONG); + return memdup_user(in, tmp.size); } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2caf36ac3e93..cc87c1abac97 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; diff --git a/fs/befs/Makefile b/fs/befs/Makefile index 2f370bd7a50d..8b9f66642a83 100644 --- a/fs/befs/Makefile +++ b/fs/befs/Makefile @@ -3,5 +3,5 @@ # obj-$(CONFIG_BEFS_FS) += befs.o - +ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o diff --git a/fs/befs/befs.h b/fs/befs/befs.h index b26642839156..3a7813ab8c95 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -88,8 +88,11 @@ enum befs_err { /****************************/ /* debug.c */ +__printf(2, 3) void befs_error(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) void befs_warning(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) void befs_debug(const struct super_block *sb, const char *fmt, ...); void befs_dump_super_block(const struct super_block *sb, befs_super_block *); diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 74e397db0b8b..a2cd305a993a 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, struct buffer_head *bh = NULL; befs_disk_btree_super *od_sup = NULL; - befs_debug(sb, "---> befs_btree_read_super()"); + befs_debug(sb, "---> %s", __func__); bh = befs_read_datastream(sb, ds, 0, NULL); @@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, goto error; } - befs_debug(sb, "<--- befs_btree_read_super()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; error: - befs_debug(sb, "<--- befs_btree_read_super() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, { uint off = 0; - befs_debug(sb, "---> befs_bt_read_node()"); + befs_debug(sb, "---> %s", __func__); if (node->bh) brelse(node->bh); node->bh = befs_read_datastream(sb, ds, node_off, &off); if (!node->bh) { - befs_error(sb, "befs_bt_read_node() failed to read " - "node at %Lu", node_off); - befs_debug(sb, "<--- befs_bt_read_node() ERROR"); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, node_off); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, node->head.all_key_length = fs16_to_cpu(sb, node->od_node->all_key_length); - befs_debug(sb, "<--- befs_btree_read_node()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; } @@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, befs_off_t node_off; int res; - befs_debug(sb, "---> befs_btree_find() Key: %s", key); + befs_debug(sb, "---> %s Key: %s", __func__, key); if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { befs_error(sb, @@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS); if (!this_node) { - befs_error(sb, "befs_btree_find() failed to allocate %u " + befs_error(sb, "befs_btree_find() failed to allocate %zu " "bytes of memory", sizeof (befs_btree_node)); goto error; } @@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, node_off = bt_super.root_node_ptr; if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " - "node at %Lu", node_off); + "node at %llu", node_off); goto error_alloc; } @@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, /* if no match, go to overflow node */ if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " - "node at %Lu", node_off); + "node at %llu", node_off); goto error_alloc; } } @@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, kfree(this_node); if (res != BEFS_BT_MATCH) { - befs_debug(sb, "<--- befs_btree_find() Key %s not found", key); + befs_debug(sb, "<--- %s Key %s not found", __func__, key); *value = 0; return BEFS_BT_NOT_FOUND; } - befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu", + befs_debug(sb, "<--- %s Found key %s, value %llu", __func__, key, *value); return BEFS_OK; @@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, kfree(this_node); error: *value = 0; - befs_debug(sb, "<--- befs_btree_find() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, char *thiskey; fs64 *valarray; - befs_debug(sb, "---> befs_find_key() %s", findkey); + befs_debug(sb, "---> %s %s", __func__, findkey); *value = 0; @@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); if (eq < 0) { - befs_debug(sb, "<--- befs_find_key() %s not found", findkey); + befs_debug(sb, "<--- %s %s not found", __func__, findkey); return BEFS_BT_NOT_FOUND; } @@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, findkey_len); if (eq == 0) { - befs_debug(sb, "<--- befs_find_key() found %s at %d", - thiskey, mid); + befs_debug(sb, "<--- %s found %s at %d", + __func__, thiskey, mid); *value = fs64_to_cpu(sb, valarray[mid]); return BEFS_BT_MATCH; @@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node, *value = fs64_to_cpu(sb, valarray[mid + 1]); else *value = fs64_to_cpu(sb, valarray[mid]); - befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid); + befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid); return BEFS_BT_PARMATCH; } @@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, uint key_sum = 0; - befs_debug(sb, "---> befs_btree_read()"); + befs_debug(sb, "---> %s", __func__); if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { befs_error(sb, @@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, } if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { - befs_error(sb, "befs_btree_read() failed to allocate %u " + befs_error(sb, "befs_btree_read() failed to allocate %zu " "bytes of memory", sizeof (befs_btree_node)); goto error; } @@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, kfree(this_node); *value = 0; *keysize = 0; - befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY"); + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); return BEFS_BT_EMPTY; } else if (res == BEFS_ERR) { goto error_alloc; @@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, *keysize = 0; *value = 0; befs_debug(sb, - "<--- befs_btree_read() END of keys at %Lu", + "<--- %s END of keys at %llu", __func__, + (unsigned long long) key_sum + this_node->head.all_key_count); brelse(this_node->bh); kfree(this_node); @@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, node_off = this_node->head.right; if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_read() failed to read " - "node at %Lu", node_off); + befs_error(sb, "%s failed to read node at %llu", + __func__, (unsigned long long)node_off); goto error_alloc; } } @@ -492,11 +493,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); - befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen); + befs_debug(sb, "Read [%llu,%d]: keysize %d", + (long long unsigned int)node_off, (int)cur_key, + (int)keylen); if (bufsize < keylen + 1) { - befs_error(sb, "befs_btree_read() keybuf too small (%u) " - "for key of size %d", bufsize, keylen); + befs_error(sb, "%s keybuf too small (%zu) " + "for key of size %d", __func__, bufsize, keylen); brelse(this_node->bh); goto error_alloc; }; @@ -506,13 +509,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, *keysize = keylen; keybuf[keylen] = '\0'; - befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off, + befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off, cur_key, keylen, keybuf, *value); brelse(this_node->bh); kfree(this_node); - befs_debug(sb, "<--- befs_btree_read()"); + befs_debug(sb, "<--- %s", __func__); return BEFS_OK; @@ -522,7 +525,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, error: *keysize = 0; *value = 0; - befs_debug(sb, "<--- befs_btree_read() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -547,26 +550,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, befs_off_t * node_off) { - befs_debug(sb, "---> befs_btree_seekleaf()"); + befs_debug(sb, "---> %s", __func__); if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_seekleaf() failed to read " - "node at %Lu", *node_off); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); goto error; } - befs_debug(sb, "Seekleaf to root node %Lu", *node_off); + befs_debug(sb, "Seekleaf to root node %llu", *node_off); if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { - befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY"); + befs_debug(sb, "<--- %s Tree is EMPTY", __func__); return BEFS_BT_EMPTY; } while (!befs_leafnode(this_node)) { if (this_node->head.all_key_count == 0) { - befs_debug(sb, "befs_btree_seekleaf() encountered " - "an empty interior node: %Lu. Using Overflow " - "node: %Lu", *node_off, + befs_debug(sb, "%s encountered " + "an empty interior node: %llu. Using Overflow " + "node: %llu", __func__, *node_off, this_node->head.overflow); *node_off = this_node->head.overflow; } else { @@ -574,19 +577,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, *node_off = fs64_to_cpu(sb, valarray[0]); } if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { - befs_error(sb, "befs_btree_seekleaf() failed to read " - "node at %Lu", *node_off); + befs_error(sb, "%s failed to read " + "node at %llu", __func__, *node_off); goto error; } - befs_debug(sb, "Seekleaf to child node %Lu", *node_off); + befs_debug(sb, "Seekleaf to child node %llu", *node_off); } - befs_debug(sb, "Node %Lu is a leaf node", *node_off); + befs_debug(sb, "Node %llu is a leaf node", *node_off); return BEFS_OK; error: - befs_debug(sb, "<--- befs_btree_seekleaf() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index 59096b5e0fc7..c467bebd50af 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds, befs_block_run run; befs_blocknr_t block; /* block coresponding to pos */ - befs_debug(sb, "---> befs_read_datastream() %Lu", pos); + befs_debug(sb, "---> %s %llu", __func__, pos); block = pos >> BEFS_SB(sb)->block_shift; if (off) *off = pos - (block << BEFS_SB(sb)->block_shift); if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { befs_error(sb, "BeFS: Error finding disk addr of block %lu", - block); - befs_debug(sb, "<--- befs_read_datastream() ERROR"); + (unsigned long)block); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } bh = befs_bread_iaddr(sb, run); if (!bh) { befs_error(sb, "BeFS: Error reading block %lu from datastream", - block); + (unsigned long)block); return NULL; } - befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu", - pos); + befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos); return bh; } @@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data, } else { befs_error(sb, "befs_fblock2brun() was asked to find block %lu, " - "which is not mapped by the datastream\n", fblock); + "which is not mapped by the datastream\n", + (unsigned long)fblock); err = BEFS_ERR; } return err; @@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, befs_off_t bytes_read = 0; /* bytes readed */ u16 plen; struct buffer_head *bh = NULL; - befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len); + befs_debug(sb, "---> %s length: %llu", __func__, len); while (bytes_read < len) { bh = befs_read_datastream(sb, ds, bytes_read, NULL); if (!bh) { befs_error(sb, "BeFS: Error reading datastream block " - "starting from %Lu", bytes_read); - befs_debug(sb, "<--- befs_read_lsymlink() ERROR"); + "starting from %llu", bytes_read); + befs_debug(sb, "<--- %s ERROR", __func__); return bytes_read; } @@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff, bytes_read += plen; } - befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read); + befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int) + bytes_read); return bytes_read; } @@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) befs_blocknr_t metablocks; /* FS metadata blocks */ befs_sb_info *befs_sb = BEFS_SB(sb); - befs_debug(sb, "---> befs_count_blocks()"); + befs_debug(sb, "---> %s", __func__); datablocks = ds->size >> befs_sb->block_shift; if (ds->size & (befs_sb->block_size - 1)) @@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) } blocks = datablocks + metablocks; - befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks); + befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks); return blocks; } @@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, befs_blocknr_t max_block = data->max_direct_range >> BEFS_SB(sb)->block_shift; - befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno); + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); if (blockno > max_block) { - befs_error(sb, "befs_find_brun_direct() passed block outside of" - "direct region"); + befs_error(sb, "%s passed block outside of direct region", + __func__); return BEFS_ERR; } @@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data, run->start = array[i].start + offset; run->len = array[i].len - offset; - befs_debug(sb, "---> befs_find_brun_direct(), " - "found %lu at direct[%d]", blockno, i); + befs_debug(sb, "---> %s, " + "found %lu at direct[%d]", __func__, + (unsigned long)blockno, i); return BEFS_OK; } } - befs_debug(sb, "---> befs_find_brun_direct() ERROR"); + befs_debug(sb, "---> %s ERROR", __func__); return BEFS_ERR; } @@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb, befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); int arraylen = befs_iaddrs_per_block(sb); - befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno); + befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; search_blk = blockno - indir_start_blk; @@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb, for (i = 0; i < indirect.len; i++) { indirblock = befs_bread(sb, indirblockno + i); if (indirblock == NULL) { - befs_debug(sb, - "---> befs_find_brun_indirect() failed to " - "read disk block %lu from the indirect brun", - indirblockno + i); + befs_debug(sb, "---> %s failed to read " + "disk block %lu from the indirect brun", + __func__, (unsigned long)indirblockno + i); return BEFS_ERR; } @@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb, brelse(indirblock); befs_debug(sb, - "<--- befs_find_brun_indirect() found " - "file block %lu at indirect[%d]", - blockno, j + (i * arraylen)); + "<--- %s found file block " + "%lu at indirect[%d]", __func__, + (unsigned long)blockno, + j + (i * arraylen)); return BEFS_OK; } sum += len; @@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb, } /* Only fallthrough is an error */ - befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find " - "file block %lu", blockno); + befs_error(sb, "BeFS: %s failed to find " + "file block %lu", __func__, (unsigned long)blockno); - befs_debug(sb, "<--- befs_find_brun_indirect() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb, size_t diblklen = iblklen * befs_iaddrs_per_block(sb) * BEFS_DBLINDIR_BRUN_LEN; - befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno); + befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno); /* First, discover which of the double_indir->indir blocks * contains pos. Then figure out how much of pos that @@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb, dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); if (dbl_which_block > data->double_indirect.len) { befs_error(sb, "The double-indirect index calculated by " - "befs_read_brun_dblindirect(), %d, is outside the range " - "of the double-indirect block", dblindir_indx); + "%s, %d, is outside the range " + "of the double-indirect block", __func__, + dblindir_indx); return BEFS_ERR; } @@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb, befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); if (dbl_indir_block == NULL) { - befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " - "double-indirect block at blockno %lu", - iaddr2blockno(sb, - &data->double_indirect) + + befs_error(sb, "%s couldn't read the " + "double-indirect block at blockno %lu", __func__, + (unsigned long) + iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); brelse(dbl_indir_block); return BEFS_ERR; @@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb, which_block = indir_indx / befs_iaddrs_per_block(sb); if (which_block > indir_run.len) { befs_error(sb, "The indirect index calculated by " - "befs_read_brun_dblindirect(), %d, is outside the range " - "of the indirect block", indir_indx); + "%s, %d, is outside the range " + "of the indirect block", __func__, indir_indx); return BEFS_ERR; } indir_block = befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); if (indir_block == NULL) { - befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " - "indirect block at blockno %lu", + befs_error(sb, "%s couldn't read the indirect block " + "at blockno %lu", __func__, (unsigned long) iaddr2blockno(sb, &indir_run) + which_block); brelse(indir_block); return BEFS_ERR; @@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb, run->len -= offset; befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," - " double_indirect_leftover = %lu", + " double_indirect_leftover = %lu", (unsigned long) blockno, dblindir_indx, indir_indx, dblindir_leftover); return BEFS_OK; diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 622e73775c83..4de7cffcd662 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -10,6 +10,7 @@ * debug functions */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #ifdef __KERNEL__ #include <stdarg.h> @@ -23,43 +24,30 @@ #include "befs.h" -#define ERRBUFSIZE 1024 - void befs_error(const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; - char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); - return; - } va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("(%s): %pV\n", sb->s_id, &vaf); va_end(args); - - printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf); - kfree(err_buf); } void befs_warning(const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; - char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE); - return; - } va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(%s): %pV\n", sb->s_id, &vaf); va_end(args); - - printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf); - - kfree(err_buf); } void @@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...) { #ifdef CONFIG_BEFS_DEBUG + struct va_format vaf; va_list args; - char *err_buf = NULL; - - if (BEFS_SB(sb)->mount_opts.debug) { - err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); - if (err_buf == NULL) { - printk(KERN_ERR "could not allocate %d bytes\n", - ERRBUFSIZE); - return; - } - - va_start(args, fmt); - vsnprintf(err_buf, ERRBUFSIZE, fmt, args); - va_end(args); - - printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf); - - kfree(err_buf); - } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s): %pV\n", sb->s_id, &vaf); + va_end(args); #endif //CONFIG_BEFS_DEBUG } @@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); - befs_debug(sb, " create_time %Lu", + befs_debug(sb, " create_time %llu", fs64_to_cpu(sb, inode->create_time)); - befs_debug(sb, " last_modified_time %Lu", + befs_debug(sb, " last_modified_time %llu", fs64_to_cpu(sb, inode->last_modified_time)); tmp_run = fsrun_to_cpu(sb, inode->parent); @@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); } - befs_debug(sb, " max_direct_range %Lu", + befs_debug(sb, " max_direct_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_direct_range)); @@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " max_indirect_range %Lu", + befs_debug(sb, " max_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_indirect_range)); @@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " max_double_indirect_range %Lu", + befs_debug(sb, " max_double_indirect_range %llu", fs64_to_cpu(sb, inode->data.datastream. max_double_indirect_range)); - befs_debug(sb, " size %Lu", + befs_debug(sb, " size %llu", fs64_to_cpu(sb, inode->data.datastream.size)); } @@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); - befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); - befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); + befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); + befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); befs_debug(sb, " blocks_per_ag %u", @@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " log_blocks %u, %hu, %hu", tmp_run.allocation_group, tmp_run.start, tmp_run.len); - befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); - befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); + befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start)); + befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end)); befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); diff --git a/fs/befs/inode.c b/fs/befs/inode.c index 94c17f9a9576..fa4b718de597 100644 --- a/fs/befs/inode.c +++ b/fs/befs/inode.c @@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, /* check magic header. */ if (magic1 != BEFS_INODE_MAGIC1) { befs_error(sb, - "Inode has a bad magic header - inode = %lu", inode); + "Inode has a bad magic header - inode = %lu", + (unsigned long)inode); return BEFS_BAD_INODE; } @@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, */ if (inode != iaddr2blockno(sb, &ino_num)) { befs_error(sb, "inode blocknr field disagrees with vfs " - "VFS: %lu, Inode %lu", - inode, iaddr2blockno(sb, &ino_num)); + "VFS: %lu, Inode %lu", (unsigned long) + inode, (unsigned long)iaddr2blockno(sb, &ino_num)); return BEFS_BAD_INODE; } @@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode, */ if (!(flags & BEFS_INODE_IN_USE)) { - befs_error(sb, "inode is not used - inode = %lu", inode); + befs_error(sb, "inode is not used - inode = %lu", + (unsigned long)inode); return BEFS_BAD_INODE; } diff --git a/fs/befs/io.c b/fs/befs/io.c index ddef98aa255d..0408a3d601d0 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_blocknr_t block = 0; befs_sb_info *befs_sb = BEFS_SB(sb); - befs_debug(sb, "---> Enter befs_read_iaddr() " - "[%u, %hu, %hu]", - iaddr.allocation_group, iaddr.start, iaddr.len); + befs_debug(sb, "---> Enter %s " + "[%u, %hu, %hu]", __func__, iaddr.allocation_group, + iaddr.start, iaddr.len); if (iaddr.allocation_group > befs_sb->num_ags) { befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", @@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) block = iaddr2blockno(sb, &iaddr); - befs_debug(sb, "befs_read_iaddr: offset = %lu", block); + befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block); bh = sb_bread(sb, block); if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", block); + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); goto error; } - befs_debug(sb, "<--- befs_read_iaddr()"); + befs_debug(sb, "<--- %s", __func__); return bh; error: - befs_debug(sb, "<--- befs_read_iaddr() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } @@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block) { struct buffer_head *bh = NULL; - befs_debug(sb, "---> Enter befs_read() %Lu", block); + befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); bh = sb_bread(sb, block); if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", block); + befs_error(sb, "Failed to read block %lu", + (unsigned long)block); goto error; } - befs_debug(sb, "<--- befs_read()"); + befs_debug(sb, "<--- %s", __func__); return bh; error: - befs_debug(sb, "<--- befs_read() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 845d2d690ce2..d626756ff721 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -5,6 +5,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> @@ -39,7 +41,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int) static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); -static int befs_init_inodecache(void); static void befs_destroy_inodecache(void); static void *befs_follow_link(struct dentry *, struct nameidata *); static void *befs_fast_follow_link(struct dentry *, struct nameidata *); @@ -131,26 +132,28 @@ befs_get_block(struct inode *inode, sector_t block, ulong disk_off; befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", - inode->i_ino, block); + (unsigned long)inode->i_ino, (long)block); if (block < 0) { befs_error(sb, "befs_get_block() was asked for a block " "number less than zero: block %ld in inode %lu", - block, inode->i_ino); + (long)block, (unsigned long)inode->i_ino); return -EIO; } if (create) { befs_error(sb, "befs_get_block() was asked to write to " - "block %ld in inode %lu", block, inode->i_ino); + "block %ld in inode %lu", (long)block, + (unsigned long)inode->i_ino); return -EPERM; } res = befs_fblock2brun(sb, ds, block, &run); if (res != BEFS_OK) { befs_error(sb, - "<--- befs_get_block() for inode %lu, block " - "%ld ERROR", inode->i_ino, block); + "<--- %s for inode %lu, block %ld ERROR", + __func__, (unsigned long)inode->i_ino, + (long)block); return -EFBIG; } @@ -158,8 +161,9 @@ befs_get_block(struct inode *inode, sector_t block, map_bh(bh_result, inode->i_sb, disk_off); - befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " - "disk address %lu", inode->i_ino, block, disk_off); + befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu", + __func__, (unsigned long)inode->i_ino, (long)block, + (unsigned long)disk_off); return 0; } @@ -176,15 +180,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) char *utfname; const char *name = dentry->d_name.name; - befs_debug(sb, "---> befs_lookup() " - "name %s inode %ld", dentry->d_name.name, dir->i_ino); + befs_debug(sb, "---> %s name %s inode %ld", __func__, + dentry->d_name.name, dir->i_ino); /* Convert to UTF-8 */ if (BEFS_SB(sb)->nls) { ret = befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); if (ret < 0) { - befs_debug(sb, "<--- befs_lookup() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return ERR_PTR(ret); } ret = befs_btree_find(sb, ds, utfname, &offset); @@ -195,12 +199,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } if (ret == BEFS_BT_NOT_FOUND) { - befs_debug(sb, "<--- befs_lookup() %s not found", + befs_debug(sb, "<--- %s %s not found", __func__, dentry->d_name.name); return ERR_PTR(-ENOENT); } else if (ret != BEFS_OK || offset == 0) { - befs_warning(sb, "<--- befs_lookup() Error"); + befs_warning(sb, "<--- %s Error", __func__); return ERR_PTR(-ENODATA); } @@ -210,7 +214,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) d_add(dentry, inode); - befs_debug(sb, "<--- befs_lookup()"); + befs_debug(sb, "<--- %s", __func__); return NULL; } @@ -228,26 +232,25 @@ befs_readdir(struct file *file, struct dir_context *ctx) char keybuf[BEFS_NAME_LEN + 1]; const char *dirname = file->f_path.dentry->d_name.name; - befs_debug(sb, "---> befs_readdir() " - "name %s, inode %ld, ctx->pos %Ld", - dirname, inode->i_ino, ctx->pos); + befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld", + __func__, dirname, inode->i_ino, ctx->pos); more: result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, keybuf, &keysize, &value); if (result == BEFS_ERR) { - befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); befs_error(sb, "IO error reading %s (inode %lu)", dirname, inode->i_ino); return -EIO; } else if (result == BEFS_BT_END) { - befs_debug(sb, "<--- befs_readdir() END"); + befs_debug(sb, "<--- %s END", __func__); return 0; } else if (result == BEFS_BT_EMPTY) { - befs_debug(sb, "<--- befs_readdir() Empty directory"); + befs_debug(sb, "<--- %s Empty directory", __func__); return 0; } @@ -260,7 +263,7 @@ more: result = befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); if (result < 0) { - befs_debug(sb, "<--- befs_readdir() ERROR"); + befs_debug(sb, "<--- %s ERROR", __func__); return result; } if (!dir_emit(ctx, nlsname, nlsnamelen, @@ -277,7 +280,7 @@ more: ctx->pos++; goto more; - befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); + befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos); return 0; } @@ -321,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) struct inode *inode; long ret = -EIO; - befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); + befs_debug(sb, "---> %s inode = %lu", __func__, ino); inode = iget_locked(sb, ino); if (!inode) @@ -428,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) } brelse(bh); - befs_debug(sb, "<--- befs_read_inode()"); + befs_debug(sb, "<--- %s", __func__); unlock_new_inode(inode); return inode; @@ -437,7 +440,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) unacquire_none: iget_failed(inode); - befs_debug(sb, "<--- befs_read_inode() - Bad inode"); + befs_debug(sb, "<--- %s - Bad inode", __func__); return ERR_PTR(ret); } @@ -445,7 +448,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) * * Taken from NFS implementation by Al Viro. */ -static int +static int __init befs_init_inodecache(void) { befs_inode_cachep = kmem_cache_create("befs_inode_cache", @@ -454,11 +457,9 @@ befs_init_inodecache(void) SLAB_MEM_SPREAD), init_once); if (befs_inode_cachep == NULL) { - printk(KERN_ERR "befs_init_inodecache: " - "Couldn't initialize inode slabcache\n"); + pr_err("%s: Couldn't initialize inode slabcache\n", __func__); return -ENOMEM; } - return 0; } @@ -544,16 +545,16 @@ befs_utf2nls(struct super_block *sb, const char *in, */ int maxlen = in_len + 1; - befs_debug(sb, "---> utf2nls()"); + befs_debug(sb, "---> %s", __func__); if (!nls) { - befs_error(sb, "befs_utf2nls called with no NLS table loaded"); + befs_error(sb, "%s called with no NLS table loaded", __func__); return -EINVAL; } *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "befs_utf2nls() cannot allocate memory"); + befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -575,14 +576,14 @@ befs_utf2nls(struct super_block *sb, const char *in, result[o] = '\0'; *out_len = o; - befs_debug(sb, "<--- utf2nls()"); + befs_debug(sb, "<--- %s", __func__); return o; conv_err: befs_error(sb, "Name using character set %s contains a character that " "cannot be converted to unicode.", nls->charset); - befs_debug(sb, "<--- utf2nls()"); + befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; } @@ -623,16 +624,17 @@ befs_nls2utf(struct super_block *sb, const char *in, * in special cases */ int maxlen = (3 * in_len) + 1; - befs_debug(sb, "---> nls2utf()\n"); + befs_debug(sb, "---> %s\n", __func__); if (!nls) { - befs_error(sb, "befs_nls2utf called with no NLS table loaded."); + befs_error(sb, "%s called with no NLS table loaded.", + __func__); return -EINVAL; } *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "befs_nls2utf() cannot allocate memory"); + befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -653,14 +655,14 @@ befs_nls2utf(struct super_block *sb, const char *in, result[o] = '\0'; *out_len = o; - befs_debug(sb, "<--- nls2utf()"); + befs_debug(sb, "<--- %s", __func__); return i; conv_err: befs_error(sb, "Name using charecter set %s contains a charecter that " "cannot be converted to unicode.", nls->charset); - befs_debug(sb, "<--- nls2utf()"); + befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; } @@ -715,8 +717,8 @@ parse_options(char *options, befs_mount_options * opts) if (option >= 0) uid = make_kuid(current_user_ns(), option); if (!uid_valid(uid)) { - printk(KERN_ERR "BeFS: Invalid uid %d, " - "using default\n", option); + pr_err("Invalid uid %d, " + "using default\n", option); break; } opts->uid = uid; @@ -729,8 +731,8 @@ parse_options(char *options, befs_mount_options * opts) if (option >= 0) gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) { - printk(KERN_ERR "BeFS: Invalid gid %d, " - "using default\n", option); + pr_err("Invalid gid %d, " + "using default\n", option); break; } opts->gid = gid; @@ -740,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts) kfree(opts->iocharset); opts->iocharset = match_strdup(&args[0]); if (!opts->iocharset) { - printk(KERN_ERR "BeFS: allocation failure for " - "iocharset string\n"); + pr_err("allocation failure for " + "iocharset string\n"); return 0; } break; @@ -749,8 +751,8 @@ parse_options(char *options, befs_mount_options * opts) opts->debug = 1; break; default: - printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " - "or missing value\n", p); + pr_err("Unrecognized mount option \"%s\" " + "or missing value\n", p); return 0; } } @@ -791,22 +793,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent) save_mount_options(sb, data); - sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); + sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); if (sb->s_fs_info == NULL) { - printk(KERN_ERR - "BeFS(%s): Unable to allocate memory for private " + pr_err("(%s): Unable to allocate memory for private " "portion of superblock. Bailing.\n", sb->s_id); goto unacquire_none; } befs_sb = BEFS_SB(sb); - memset(befs_sb, 0, sizeof(befs_sb_info)); if (!parse_options((char *) data, &befs_sb->mount_opts)) { befs_error(sb, "cannot parse mount options"); goto unacquire_priv_sbp; } - befs_debug(sb, "---> befs_fill_super()"); + befs_debug(sb, "---> %s", __func__); #ifndef CONFIG_BEFS_RW if (!(sb->s_flags & MS_RDONLY)) { @@ -854,7 +854,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) goto unacquire_priv_sbp; if( befs_sb->num_blocks > ~((sector_t)0) ) { - befs_error(sb, "blocks count: %Lu " + befs_error(sb, "blocks count: %llu " "is larger than the host can use", befs_sb->num_blocks); goto unacquire_priv_sbp; @@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) static int befs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EINVAL; return 0; @@ -924,7 +925,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - befs_debug(sb, "---> befs_statfs()"); + befs_debug(sb, "---> %s", __func__); buf->f_type = BEFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -937,7 +938,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = BEFS_NAME_LEN; - befs_debug(sb, "<--- befs_statfs()"); + befs_debug(sb, "<--- %s", __func__); return 0; } @@ -963,7 +964,7 @@ init_befs_fs(void) { int err; - printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); + pr_info("version: %s\n", BEFS_VERSION); err = befs_init_inodecache(); if (err) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 8defc6b3f9a2..7041ac35ace8 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode) dprintf("ino=%08lx\n", ino); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); @@ -266,7 +266,7 @@ static void init_once(void *foo) inode_init_once(&bi->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 67be2951b98a..aa3cb626671e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,10 +46,15 @@ #endif static int load_elf_binary(struct linux_binprm *bprm); -static int load_elf_library(struct file *); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); +#ifdef CONFIG_USELIB +static int load_elf_library(struct file *); +#else +#define load_elf_library NULL +#endif + /* * If we don't support core dumping, then supply a NULL so we * don't even try. @@ -579,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; - unsigned long def_flags = 0; struct pt_regs *regs = current_pt_regs(); struct { struct elfhdr elf_ex; @@ -719,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* OK, This is the point of no return */ - current->mm->def_flags = def_flags; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY(loc->elf_ex); @@ -1005,6 +1006,7 @@ out_free_ph: goto out; } +#ifdef CONFIG_USELIB /* This is really simpleminded and specialized - we are loading an a.out library that is given an ELF header. */ static int load_elf_library(struct file *file) @@ -1083,6 +1085,7 @@ out_free_ph: out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_ELF_CORE /* diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1c740e152f38..b60500300dd7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, mutex_unlock(&root->d_inode->i_mutex); dput(root); + break; default: return res; } return count; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 4f70f383132c..1c2ce0c87711 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -182,6 +182,9 @@ static int bdev_integrity_enabled(struct block_device *bdev, int rw) */ int bio_integrity_enabled(struct bio *bio) { + if (!bio_is_rw(bio)) + return 0; + /* Already protected? */ if (bio_integrity(bio)) return 0; @@ -301,45 +304,65 @@ int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) EXPORT_SYMBOL(bio_integrity_get_tag); /** - * bio_integrity_generate - Generate integrity metadata for a bio - * @bio: bio to generate integrity metadata for - * - * Description: Generates integrity metadata for a bio by calling the - * block device's generation callback function. The bio must have a - * bip attached with enough room to accommodate the generated - * integrity metadata. + * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio + * @bio: bio to generate/verify integrity metadata for + * @operate: operate number, 1 for generate, 0 for verify */ -static void bio_integrity_generate(struct bio *bio) +static int bio_integrity_generate_verify(struct bio *bio, int operate) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_exchg bix; - struct bio_vec bv; - struct bvec_iter iter; - sector_t sector = bio->bi_iter.bi_sector; - unsigned int sectors, total; + struct bio_vec *bv; + sector_t sector; + unsigned int sectors, ret = 0, i; void *prot_buf = bio->bi_integrity->bip_buf; - total = 0; + if (operate) + sector = bio->bi_iter.bi_sector; + else + sector = bio->bi_integrity->bip_iter.bi_sector; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; bix.sector_size = bi->sector_size; - bio_for_each_segment(bv, bio, iter) { - void *kaddr = kmap_atomic(bv.bv_page); - bix.data_buf = kaddr + bv.bv_offset; - bix.data_size = bv.bv_len; + bio_for_each_segment_all(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; bix.prot_buf = prot_buf; bix.sector = sector; - bi->generate_fn(&bix); + if (operate) + bi->generate_fn(&bix); + else { + ret = bi->verify_fn(&bix); + if (ret) { + kunmap_atomic(kaddr); + return ret; + } + } - sectors = bv.bv_len / bi->sector_size; + sectors = bv->bv_len / bi->sector_size; sector += sectors; prot_buf += sectors * bi->tuple_size; - total += sectors * bi->tuple_size; - BUG_ON(total > bio->bi_integrity->bip_iter.bi_size); kunmap_atomic(kaddr); } + return ret; +} + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + bio_integrity_generate_verify(bio, 1); } static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) @@ -454,40 +477,7 @@ EXPORT_SYMBOL(bio_integrity_prep); */ static int bio_integrity_verify(struct bio *bio) { - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - struct blk_integrity_exchg bix; - struct bio_vec *bv; - sector_t sector = bio->bi_integrity->bip_iter.bi_sector; - unsigned int sectors, ret = 0; - void *prot_buf = bio->bi_integrity->bip_buf; - int i; - - bix.disk_name = bio->bi_bdev->bd_disk->disk_name; - bix.sector_size = bi->sector_size; - - bio_for_each_segment_all(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); - - bix.data_buf = kaddr + bv->bv_offset; - bix.data_size = bv->bv_len; - bix.prot_buf = prot_buf; - bix.sector = sector; - - ret = bi->verify_fn(&bix); - - if (ret) { - kunmap_atomic(kaddr); - return ret; - } - - sectors = bv->bv_len / bi->sector_size; - sector += sectors; - prot_buf += sectors * bi->tuple_size; - - kunmap_atomic(kaddr); - } - - return ret; + return bio_integrity_generate_verify(bio, 0); } /** @@ -116,7 +116,6 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) if (!slab) goto out_unlock; - printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry); bslab->slab = slab; bslab->slab_ref = 1; bslab->slab_size = sz; @@ -1003,7 +1002,7 @@ struct bio_map_data { }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int is_our_pages) { memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); @@ -1023,7 +1022,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -1121,7 +1120,7 @@ EXPORT_SYMBOL(bio_uncopy_user); */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; @@ -1260,7 +1259,7 @@ EXPORT_SYMBOL(bio_copy_user); static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { int i, j; @@ -1408,7 +1407,7 @@ EXPORT_SYMBOL(bio_map_user); * device. Returns an error pointer in case of error. */ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio *bio; @@ -1970,7 +1969,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_css(current, blkio_subsys_id); + css = task_css(current, blkio_cgrp_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/fs/block_dev.c b/fs/block_dev.c index 1e86823a9cbd..552a8d13bc32 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; invalidate_bh_lrus(); @@ -419,7 +419,7 @@ static void bdev_evict_inode(struct inode *inode) { struct block_device *bdev = &BDEV_I(inode)->bdev; struct list_head *p; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); spin_lock(&bdev_lock); @@ -1518,12 +1518,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + if (err < 0) ret = err; } blk_finish_plug(&plug); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c1e0b0caf9cc..5a201d81049c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -21,708 +22,315 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> +#include <linux/workqueue.h> #include "async-thread.h" +#include "ctree.h" + +#define WORK_DONE_BIT 0 +#define WORK_ORDER_DONE_BIT 1 +#define WORK_HIGH_PRIO_BIT 2 + +#define NO_THRESHOLD (-1) +#define DFT_THRESHOLD (32) + +struct __btrfs_workqueue { + struct workqueue_struct *normal_wq; + /* List head pointing to ordered work list */ + struct list_head ordered_list; + + /* Spinlock for ordered_list */ + spinlock_t list_lock; + + /* Thresholding related variants */ + atomic_t pending; + int max_active; + int current_max; + int thresh; + unsigned int count; + spinlock_t thres_lock; +}; -#define WORK_QUEUED_BIT 0 -#define WORK_DONE_BIT 1 -#define WORK_ORDER_DONE_BIT 2 -#define WORK_HIGH_PRIO_BIT 3 - -/* - * container for the kthread task pointer and the list of pending work - * One of these is allocated per thread. - */ -struct btrfs_worker_thread { - /* pool we belong to */ - struct btrfs_workers *workers; - - /* list of struct btrfs_work that are waiting for service */ - struct list_head pending; - struct list_head prio_pending; - - /* list of worker threads from struct btrfs_workers */ - struct list_head worker_list; - - /* kthread */ - struct task_struct *task; +struct btrfs_workqueue { + struct __btrfs_workqueue *normal; + struct __btrfs_workqueue *high; +}; - /* number of things on the pending list */ - atomic_t num_pending; +static inline struct __btrfs_workqueue +*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, + int thresh) +{ + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - /* reference counter for this struct */ - atomic_t refs; + if (unlikely(!ret)) + return NULL; - unsigned long sequence; + ret->max_active = max_active; + atomic_set(&ret->pending, 0); + if (thresh == 0) + thresh = DFT_THRESHOLD; + /* For low threshold, disabling threshold is a better choice */ + if (thresh < DFT_THRESHOLD) { + ret->current_max = max_active; + ret->thresh = NO_THRESHOLD; + } else { + ret->current_max = 1; + ret->thresh = thresh; + } - /* protects the pending list. */ - spinlock_t lock; + if (flags & WQ_HIGHPRI) + ret->normal_wq = alloc_workqueue("%s-%s-high", flags, + ret->max_active, + "btrfs", name); + else + ret->normal_wq = alloc_workqueue("%s-%s", flags, + ret->max_active, "btrfs", + name); + if (unlikely(!ret->normal_wq)) { + kfree(ret); + return NULL; + } - /* set to non-zero when this thread is already awake and kicking */ - int working; + INIT_LIST_HEAD(&ret->ordered_list); + spin_lock_init(&ret->list_lock); + spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); + return ret; +} - /* are we currently idle */ - int idle; -}; +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -static int __btrfs_start_workers(struct btrfs_workers *workers); +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh) +{ + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); -/* - * btrfs_start_workers uses kthread_run, which can block waiting for memory - * for a very long time. It will actually throttle on page writeback, - * and so it may not make progress until after our btrfs worker threads - * process all of the pending work structs in their queue - * - * This means we can't use btrfs_start_workers from inside a btrfs worker - * thread that is used as part of cleaning dirty memory, which pretty much - * involves all of the worker threads. - * - * Instead we have a helper queue who never has more than one thread - * where we scheduler thread start operations. This worker_start struct - * is used to contain the work and hold a pointer to the queue that needs - * another worker. - */ -struct worker_start { - struct btrfs_work work; - struct btrfs_workers *queue; -}; + if (unlikely(!ret)) + return NULL; -static void start_new_worker_func(struct btrfs_work *work) -{ - struct worker_start *start; - start = container_of(work, struct worker_start, work); - __btrfs_start_workers(start->queue); - kfree(start); -} + ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + max_active, thresh); + if (unlikely(!ret->normal)) { + kfree(ret); + return NULL; + } -/* - * helper function to move a thread onto the idle list after it - * has finished some requests. - */ -static void check_idle_worker(struct btrfs_worker_thread *worker) -{ - if (!worker->idle && atomic_read(&worker->num_pending) < - worker->workers->idle_thresh / 2) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 1; - - /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move(&worker->worker_list, - &worker->workers->idle_list); + if (flags & WQ_HIGHPRI) { + ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + thresh); + if (unlikely(!ret->high)) { + __btrfs_destroy_workqueue(ret->normal); + kfree(ret); + return NULL; } - spin_unlock_irqrestore(&worker->workers->lock, flags); } + return ret; } /* - * helper function to move a thread off the idle list after new - * pending work is added. + * Hook for threshold which will be called in btrfs_queue_work. + * This hook WILL be called in IRQ handler context, + * so workqueue_set_max_active MUST NOT be called in this hook */ -static void check_busy_worker(struct btrfs_worker_thread *worker) +static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) { - if (worker->idle && atomic_read(&worker->num_pending) >= - worker->workers->idle_thresh) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 0; - - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } + if (wq->thresh == NO_THRESHOLD) + return; + atomic_inc(&wq->pending); } -static void check_pending_worker_creates(struct btrfs_worker_thread *worker) +/* + * Hook for threshold which will be called before executing the work, + * This hook is called in kthread content. + * So workqueue_set_max_active is called here. + */ +static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) { - struct btrfs_workers *workers = worker->workers; - struct worker_start *start; - unsigned long flags; - - rmb(); - if (!workers->atomic_start_pending) - return; + int new_max_active; + long pending; + int need_change = 0; - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) + if (wq->thresh == NO_THRESHOLD) return; - start->work.func = start_new_worker_func; - start->queue = workers; - - spin_lock_irqsave(&workers->lock, flags); - if (!workers->atomic_start_pending) - goto out; - - workers->atomic_start_pending = 0; - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) - goto out; - - workers->num_workers_starting += 1; - spin_unlock_irqrestore(&workers->lock, flags); - btrfs_queue_worker(workers->atomic_worker_start, &start->work); - return; + atomic_dec(&wq->pending); + spin_lock(&wq->thres_lock); + /* + * Use wq->count to limit the calling frequency of + * workqueue_set_max_active. + */ + wq->count++; + wq->count %= (wq->thresh / 4); + if (!wq->count) + goto out; + new_max_active = wq->current_max; + /* + * pending may be changed later, but it's OK since we really + * don't need it so accurate to calculate new_max_active. + */ + pending = atomic_read(&wq->pending); + if (pending > wq->thresh) + new_max_active++; + if (pending < wq->thresh / 2) + new_max_active--; + new_max_active = clamp_val(new_max_active, 1, wq->max_active); + if (new_max_active != wq->current_max) { + need_change = 1; + wq->current_max = new_max_active; + } out: - kfree(start); - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&wq->thres_lock); + + if (need_change) { + workqueue_set_max_active(wq->normal_wq, wq->current_max); + } } -static noinline void run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) +static void run_ordered_work(struct __btrfs_workqueue *wq) { - if (!workers->ordered) - return; - - set_bit(WORK_DONE_BIT, &work->flags); - - spin_lock(&workers->order_lock); + struct list_head *list = &wq->ordered_list; + struct btrfs_work *work; + spinlock_t *lock = &wq->list_lock; + unsigned long flags; while (1) { - if (!list_empty(&workers->prio_order_list)) { - work = list_entry(workers->prio_order_list.next, - struct btrfs_work, order_list); - } else if (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - } else { + spin_lock_irqsave(lock, flags); + if (list_empty(list)) break; - } + work = list_entry(list->next, struct btrfs_work, + ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; - /* we are going to call the ordered done function, but + /* + * we are going to call the ordered done function, but * we leave the work item on the list as a barrier so * that later work items that are done don't have their * functions called before this one returns */ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; - - spin_unlock(&workers->order_lock); - + trace_btrfs_ordered_sched(work); + spin_unlock_irqrestore(lock, flags); work->ordered_func(work); /* now take the lock again and drop our item from the list */ - spin_lock(&workers->order_lock); - list_del(&work->order_list); - spin_unlock(&workers->order_lock); + spin_lock_irqsave(lock, flags); + list_del(&work->ordered_list); + spin_unlock_irqrestore(lock, flags); /* * we don't want to call the ordered free functions * with the lock held though */ work->ordered_free(work); - spin_lock(&workers->order_lock); - } - - spin_unlock(&workers->order_lock); -} - -static void put_worker(struct btrfs_worker_thread *worker) -{ - if (atomic_dec_and_test(&worker->refs)) - kfree(worker); -} - -static int try_worker_shutdown(struct btrfs_worker_thread *worker) -{ - int freeit = 0; - - spin_lock_irq(&worker->lock); - spin_lock(&worker->workers->lock); - if (worker->workers->num_workers > 1 && - worker->idle && - !worker->working && - !list_empty(&worker->worker_list) && - list_empty(&worker->prio_pending) && - list_empty(&worker->pending) && - atomic_read(&worker->num_pending) == 0) { - freeit = 1; - list_del_init(&worker->worker_list); - worker->workers->num_workers--; + trace_btrfs_all_work_done(work); } - spin_unlock(&worker->workers->lock); - spin_unlock_irq(&worker->lock); - - if (freeit) - put_worker(worker); - return freeit; + spin_unlock_irqrestore(lock, flags); } -static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, - struct list_head *prio_head, - struct list_head *head) +static void normal_work_helper(struct work_struct *arg) { - struct btrfs_work *work = NULL; - struct list_head *cur = NULL; - - if (!list_empty(prio_head)) - cur = prio_head->next; - - smp_mb(); - if (!list_empty(&worker->prio_pending)) - goto refill; - - if (!list_empty(head)) - cur = head->next; - - if (cur) - goto out; - -refill: - spin_lock_irq(&worker->lock); - list_splice_tail_init(&worker->prio_pending, prio_head); - list_splice_tail_init(&worker->pending, head); - - if (!list_empty(prio_head)) - cur = prio_head->next; - else if (!list_empty(head)) - cur = head->next; - spin_unlock_irq(&worker->lock); - - if (!cur) - goto out_fail; - -out: - work = list_entry(cur, struct btrfs_work, list); - -out_fail: - return work; -} - -/* - * main loop for servicing work items - */ -static int worker_loop(void *arg) -{ - struct btrfs_worker_thread *worker = arg; - struct list_head head; - struct list_head prio_head; struct btrfs_work *work; + struct __btrfs_workqueue *wq; + int need_order = 0; - INIT_LIST_HEAD(&head); - INIT_LIST_HEAD(&prio_head); - - do { -again: - while (1) { - - - work = get_next_work(worker, &prio_head, &head); - if (!work) - break; - - list_del(&work->list); - clear_bit(WORK_QUEUED_BIT, &work->flags); - - work->worker = worker; - - work->func(work); - - atomic_dec(&worker->num_pending); - /* - * unless this is an ordered work queue, - * 'work' was probably freed by func above. - */ - run_ordered_completions(worker->workers, work); - - check_pending_worker_creates(worker); - cond_resched(); - } - - spin_lock_irq(&worker->lock); - check_idle_worker(worker); - - if (freezing(current)) { - worker->working = 0; - spin_unlock_irq(&worker->lock); - try_to_freeze(); - } else { - spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) { - cpu_relax(); - /* - * we've dropped the lock, did someone else - * jump_in? - */ - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - /* - * this short schedule allows more work to - * come in without the queue functions - * needing to go through wake_up_process() - * - * worker->working is still 1, so nobody - * is going to try and wake us up - */ - schedule_timeout(1); - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - if (kthread_should_stop()) - break; - - /* still no more work?, sleep for real */ - spin_lock_irq(&worker->lock); - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) { - spin_unlock_irq(&worker->lock); - set_current_state(TASK_RUNNING); - goto again; - } - - /* - * this makes sure we get a wakeup when someone - * adds something new to the queue - */ - worker->working = 0; - spin_unlock_irq(&worker->lock); - - if (!kthread_should_stop()) { - schedule_timeout(HZ * 120); - if (!worker->working && - try_worker_shutdown(worker)) { - return 0; - } - } - } - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -/* - * this will wait for all the worker threads to shutdown - */ -void btrfs_stop_workers(struct btrfs_workers *workers) -{ - struct list_head *cur; - struct btrfs_worker_thread *worker; - int can_stop; - - spin_lock_irq(&workers->lock); - workers->stopping = 1; - list_splice_init(&workers->idle_list, &workers->worker_list); - while (!list_empty(&workers->worker_list)) { - cur = workers->worker_list.next; - worker = list_entry(cur, struct btrfs_worker_thread, - worker_list); - - atomic_inc(&worker->refs); - workers->num_workers -= 1; - if (!list_empty(&worker->worker_list)) { - list_del_init(&worker->worker_list); - put_worker(worker); - can_stop = 1; - } else - can_stop = 0; - spin_unlock_irq(&workers->lock); - if (can_stop) - kthread_stop(worker->task); - spin_lock_irq(&workers->lock); - put_worker(worker); + work = container_of(arg, struct btrfs_work, normal_work); + /* + * We should not touch things inside work in the following cases: + * 1) after work->func() if it has no ordered_free + * Since the struct is freed in work->func(). + * 2) after setting WORK_DONE_BIT + * The work may be freed in other threads almost instantly. + * So we save the needed things here. + */ + if (work->ordered_func) + need_order = 1; + wq = work->wq; + + trace_btrfs_work_sched(work); + thresh_exec_hook(wq); + work->func(work); + if (need_order) { + set_bit(WORK_DONE_BIT, &work->flags); + run_ordered_work(wq); } - spin_unlock_irq(&workers->lock); + if (!need_order) + trace_btrfs_all_work_done(work); } -/* - * simple init on struct btrfs_workers - */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_helper) +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free) { - workers->num_workers = 0; - workers->num_workers_starting = 0; - INIT_LIST_HEAD(&workers->worker_list); - INIT_LIST_HEAD(&workers->idle_list); - INIT_LIST_HEAD(&workers->order_list); - INIT_LIST_HEAD(&workers->prio_order_list); - spin_lock_init(&workers->lock); - spin_lock_init(&workers->order_lock); - workers->max_workers = max; - workers->idle_thresh = 32; - workers->name = name; - workers->ordered = 0; - workers->atomic_start_pending = 0; - workers->atomic_worker_start = async_helper; - workers->stopping = 0; + work->func = func; + work->ordered_func = ordered_func; + work->ordered_free = ordered_free; + INIT_WORK(&work->normal_work, normal_work_helper); + INIT_LIST_HEAD(&work->ordered_list); + work->flags = 0; } -/* - * starts new worker threads. This does not enforce the max worker - * count in case you need to temporarily go past it. - */ -static int __btrfs_start_workers(struct btrfs_workers *workers) +static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, + struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - int ret = 0; - - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } - - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_create(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + 1); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - goto fail; - } + unsigned long flags; - spin_lock_irq(&workers->lock); - if (workers->stopping) { - spin_unlock_irq(&workers->lock); - ret = -EINVAL; - goto fail_kthread; + work->wq = wq; + thresh_queue_hook(wq); + if (work->ordered_func) { + spin_lock_irqsave(&wq->list_lock, flags); + list_add_tail(&work->ordered_list, &wq->ordered_list); + spin_unlock_irqrestore(&wq->list_lock, flags); } - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); - - wake_up_process(worker->task); - return 0; - -fail_kthread: - kthread_stop(worker->task); -fail: - kfree(worker); - spin_lock_irq(&workers->lock); - workers->num_workers_starting--; - spin_unlock_irq(&workers->lock); - return ret; + queue_work(wq->normal_wq, &work->normal_work); + trace_btrfs_work_queued(work); } -int btrfs_start_workers(struct btrfs_workers *workers) -{ - spin_lock_irq(&workers->lock); - workers->num_workers_starting++; - spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers); -} - -/* - * run through the list and find a worker thread that doesn't have a lot - * to do right now. This can return null if we aren't yet at the thread - * count limit and all of the threads are busy. - */ -static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - struct list_head *next; - int enforce_min; + struct __btrfs_workqueue *dest_wq; - enforce_min = (workers->num_workers + workers->num_workers_starting) < - workers->max_workers; - - /* - * if we find an idle thread, don't move it to the end of the - * idle list. This improves the chance that the next submission - * will reuse the same thread, and maybe catch it while it is still - * working - */ - if (!list_empty(&workers->idle_list)) { - next = workers->idle_list.next; - worker = list_entry(next, struct btrfs_worker_thread, - worker_list); - return worker; - } - if (enforce_min || list_empty(&workers->worker_list)) - return NULL; - - /* - * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced. - * Do the move in batches based on the sequence number. This groups - * requests submitted at roughly the same time onto the same worker. - */ - next = workers->worker_list.next; - worker = list_entry(next, struct btrfs_worker_thread, worker_list); - worker->sequence++; - - if (worker->sequence % workers->idle_thresh == 0) - list_move_tail(next, &workers->worker_list); - return worker; + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) + dest_wq = wq->high; + else + dest_wq = wq->normal; + __btrfs_queue_work(dest_wq, work); } -/* - * selects a worker thread to take the next job. This will either find - * an idle worker, start a new worker up to the max count, or just return - * one of the existing busy workers. - */ -static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker; - unsigned long flags; - struct list_head *fallback; - int ret; - - spin_lock_irqsave(&workers->lock, flags); -again: - worker = next_worker(workers); - - if (!worker) { - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) { - goto fallback; - } else if (workers->atomic_worker_start) { - workers->atomic_start_pending = 1; - goto fallback; - } else { - workers->num_workers_starting++; - spin_unlock_irqrestore(&workers->lock, flags); - /* we're below the limit, start another worker */ - ret = __btrfs_start_workers(workers); - spin_lock_irqsave(&workers->lock, flags); - if (ret) - goto fallback; - goto again; - } - } - goto found; - -fallback: - fallback = NULL; - /* - * we have failed to find any workers, just - * return the first one we can find. - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); -found: - /* - * this makes sure the worker doesn't exit before it is placed - * onto a busy/idle list - */ - atomic_inc(&worker->num_pending); - spin_unlock_irqrestore(&workers->lock, flags); - return worker; + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); + kfree(wq); } -/* - * btrfs_requeue_work just puts the work item back on the tail of the list - * it was taken from. It is intended for use with long running work functions - * that make some progress and want to give the cpu up for others. - */ -void btrfs_requeue_work(struct btrfs_work *work) +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { - struct btrfs_worker_thread *worker = work->worker; - unsigned long flags; - int wake = 0; - - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + if (!wq) return; - - spin_lock_irqsave(&worker->lock, flags); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); - - /* by definition we're busy, take ourselves off the idle - * list - */ - if (worker->idle) { - spin_lock(&worker->workers->lock); - worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - spin_unlock(&worker->workers->lock); - } - if (!worker->working) { - wake = 1; - worker->working = 1; - } - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); + if (wq->high) + __btrfs_destroy_workqueue(wq->high); + __btrfs_destroy_workqueue(wq->normal); + kfree(wq); } -void btrfs_set_work_high_prio(struct btrfs_work *work) +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) { - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + if (!wq) + return; + wq->normal->max_active = max; + if (wq->high) + wq->high->max_active = max; } -/* - * places a struct btrfs_work into the pending queue of one of the kthreads - */ -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +void btrfs_set_work_high_priority(struct btrfs_work *work) { - struct btrfs_worker_thread *worker; - unsigned long flags; - int wake = 0; - - /* don't requeue something already on a list */ - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - return; - - worker = find_worker(workers); - if (workers->ordered) { - /* - * you're not allowed to do ordered queues from an - * interrupt handler - */ - spin_lock(&workers->order_lock); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { - list_add_tail(&work->order_list, - &workers->prio_order_list); - } else { - list_add_tail(&work->order_list, &workers->order_list); - } - spin_unlock(&workers->order_lock); - } else { - INIT_LIST_HEAD(&work->order_list); - } - - spin_lock_irqsave(&worker->lock, flags); - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - check_busy_worker(worker); - - /* - * avoid calling into wake_up_process if this thread has already - * been kicked - */ - if (!worker->working) - wake = 1; - worker->working = 1; - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1f26792683ed..9c6b66d15fb0 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -19,103 +20,35 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ -struct btrfs_worker_thread; +struct btrfs_workqueue; +/* Internal use only */ +struct __btrfs_workqueue; +struct btrfs_work; +typedef void (*btrfs_func_t)(struct btrfs_work *arg); -/* - * This is similar to a workqueue, but it is meant to spread the operations - * across all available cpus instead of just the CPU that was used to - * queue the work. There is also some batching introduced to try and - * cut down on context switches. - * - * By default threads are added on demand up to 2 * the number of cpus. - * Changing struct btrfs_workers->max_workers is one way to prevent - * demand creation of kthreads. - * - * the basic model of these worker threads is to embed a btrfs_work - * structure in your own data struct, and use container_of in a - * work function to get back to your data struct. - */ struct btrfs_work { - /* - * func should be set to the function you want called - * your work struct is passed as the only arg - * - * ordered_func must be set for work sent to an ordered work queue, - * and it is called to complete a given work item in the same - * order they were sent to the queue. - */ - void (*func)(struct btrfs_work *work); - void (*ordered_func)(struct btrfs_work *work); - void (*ordered_free)(struct btrfs_work *work); - - /* - * flags should be set to zero. It is used to make sure the - * struct is only inserted once into the list. - */ + btrfs_func_t func; + btrfs_func_t ordered_func; + btrfs_func_t ordered_free; + + /* Don't touch things below */ + struct work_struct normal_work; + struct list_head ordered_list; + struct __btrfs_workqueue *wq; unsigned long flags; - - /* don't touch these */ - struct btrfs_worker_thread *worker; - struct list_head list; - struct list_head order_list; -}; - -struct btrfs_workers { - /* current number of running workers */ - int num_workers; - - int num_workers_starting; - - /* max number of workers allowed. changed by btrfs_start_workers */ - int max_workers; - - /* once a worker has this many requests or fewer, it is idle */ - int idle_thresh; - - /* force completions in the order they were queued */ - int ordered; - - /* more workers required, but in an interrupt handler */ - int atomic_start_pending; - - /* - * are we allowed to sleep while starting workers or are we required - * to start them at a later time? If we can't sleep, this indicates - * which queue we need to use to schedule thread creation. - */ - struct btrfs_workers *atomic_worker_start; - - /* list with all the work threads. The workers on the idle thread - * may be actively servicing jobs, but they haven't yet hit the - * idle thresh limit above. - */ - struct list_head worker_list; - struct list_head idle_list; - - /* - * when operating in ordered mode, this maintains the list - * of work items waiting for completion - */ - struct list_head order_list; - struct list_head prio_order_list; - - /* lock for finding the next worker thread to queue on */ - spinlock_t lock; - - /* lock for the ordered lists */ - spinlock_t order_lock; - - /* extra name for this worker, used for current->name */ - char *name; - - int stopping; }; -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers); -void btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_starter); -void btrfs_requeue_work(struct btrfs_work *work); -void btrfs_set_work_high_prio(struct btrfs_work *work); +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, + int flags, + int max_active, + int thresh); +void btrfs_init_work(struct btrfs_work *work, + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free); +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); +void btrfs_set_work_high_priority(struct btrfs_work *work); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index aded3ef3d3d4..10db21fa0926 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct __prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos) + int level, u64 time_seq, const u64 *extent_item_pos, + u64 total_refs) { int ret = 0; int slot; @@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret && count < ref->count) { + while (!ret && count < total_refs) { eb = path->nodes[0]; slot = path->slots[0]; @@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct __prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { struct btrfs_root *root; struct btrfs_key root_key; @@ -329,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - root_level = btrfs_old_root_level(root, time_seq); + if (path->search_commit_root) + root_level = btrfs_header_level(root->commit_root); + else + root_level = btrfs_old_root_level(root, time_seq); if (root_level + 1 == level) { srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -361,7 +365,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos); + extent_item_pos, total_refs); out: path->lowest_level = 0; btrfs_release_path(path); @@ -374,7 +378,7 @@ out: static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct list_head *head, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { int err; int ret = 0; @@ -400,7 +404,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos); + parents, extent_item_pos, + total_refs); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -557,7 +562,7 @@ static void __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct list_head *prefs) + struct list_head *prefs, u64 *total_refs) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; @@ -593,6 +598,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, default: BUG_ON(1); } + *total_refs += (node->ref_mod * sgn); switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { struct btrfs_delayed_tree_ref *ref; @@ -653,7 +659,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, */ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - int *info_level, struct list_head *prefs) + int *info_level, struct list_head *prefs, + u64 *total_refs) { int ret = 0; int slot; @@ -677,6 +684,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); + *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -859,6 +867,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs; struct __prelim_ref *ref; struct extent_inode_elem *eie = NULL; + u64 total_refs = 0; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -873,8 +882,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) + if (!trans) { path->search_commit_root = 1; + path->skip_locking = 1; + } /* * grab both a lock on the path and a lock on the delayed ref head. @@ -915,7 +926,7 @@ again: } spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, - &prefs_delayed); + &prefs_delayed, &total_refs); mutex_unlock(&head->mutex); if (ret) goto out; @@ -936,7 +947,8 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, - &info_level, &prefs); + &info_level, &prefs, + &total_refs); if (ret) goto out; ret = __add_keyed_refs(fs_info, path, bytenr, @@ -956,7 +968,7 @@ again: __merge_refs(&prefs, 1); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, - extent_item_pos); + extent_item_pos, total_refs); if (ret) goto out; @@ -965,7 +977,7 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); WARN_ON(ref->count < 0); - if (ref->count && ref->root_id && ref->parent == 0) { + if (roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) @@ -1061,22 +1073,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, u64 time_seq, struct ulist **leafs, const u64 *extent_item_pos) { - struct ulist *tmp; int ret; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) { - ulist_free(tmp); + if (!*leafs) return -ENOMEM; - } ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, *leafs, tmp, extent_item_pos); - ulist_free(tmp); - + time_seq, *leafs, NULL, extent_item_pos); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1098,9 +1102,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * * returns 0 on success, < 0 on error. */ -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1136,6 +1140,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return 0; } +int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) +{ + int ret; + + if (!trans) + down_read(&fs_info->commit_root_sem); + ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots); + if (!trans) + up_read(&fs_info->commit_root_sem); + return ret; +} + /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1333,38 +1351,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, if (ret < 0) return ret; - while (1) { - u32 nritems; - if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); - ret = btrfs_prev_leaf(fs_info->extent_root, path); - if (ret != 0) { - if (ret > 0) { - pr_debug("logical %llu is not within " - "any extent\n", logical); - ret = -ENOENT; - } - return ret; - } - } else { - path->slots[0]--; - } - nritems = btrfs_header_nritems(path->nodes[0]); - if (nritems == 0) { - pr_debug("logical %llu is not within any extent\n", - logical); - return -ENOENT; - } - if (path->slots[0] == nritems) - path->slots[0]--; - - btrfs_item_key_to_cpu(path->nodes[0], found_key, - path->slots[0]); - if (found_key->type == BTRFS_EXTENT_ITEM_KEY || - found_key->type == BTRFS_METADATA_ITEM_KEY) - break; + ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; } - + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) @@ -1540,6 +1533,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + } else { + down_read(&fs_info->commit_root_sem); } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, @@ -1550,8 +1545,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val, + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1573,6 +1568,8 @@ out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans, fs_info->extent_root); + } else { + up_read(&fs_info->commit_root_sem); } return ret; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8fed2125689e..c9a24444ec9a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -109,14 +109,17 @@ struct btrfs_inode { u64 last_trans; /* - * log transid when this inode was last modified + * transid that last logged this inode */ - u64 last_sub_trans; + u64 logged_trans; /* - * transid that last logged this inode + * log transid when this inode was last modified */ - u64 logged_trans; + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file @@ -155,9 +158,6 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; - /* a local copy of root's last_log_commit */ - unsigned long last_log_commit; - /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b01fb6c527e3..d43c544d3b68 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, pg_index); rcu_read_unlock(); - if (page) { + if (page && !radix_tree_exceptional_entry(page)) { misses++; if (misses > 4) break; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cbd3a7d6fa68..1bcfcdb23cf4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2769,9 +2769,13 @@ again: * the commit roots are read only * so we always do read locks */ + if (p->need_commit_sem) + down_read(&root->fs_info->commit_root_sem); b = root->commit_root; extent_buffer_get(b); level = btrfs_header_level(b); + if (p->need_commit_sem) + up_read(&root->fs_info->commit_root_sem); if (!p->skip_locking) btrfs_tree_read_lock(b); } else { @@ -5360,7 +5364,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, { int ret; int cmp; - struct btrfs_trans_handle *trans = NULL; struct btrfs_path *left_path = NULL; struct btrfs_path *right_path = NULL; struct btrfs_key left_key; @@ -5376,9 +5379,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root, int advance_right; u64 left_blockptr; u64 right_blockptr; - u64 left_start_ctransid; - u64 right_start_ctransid; - u64 ctransid; + u64 left_gen; + u64 right_gen; left_path = btrfs_alloc_path(); if (!left_path) { @@ -5402,21 +5404,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_path->search_commit_root = 1; right_path->skip_locking = 1; - spin_lock(&left_root->root_item_lock); - left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - - spin_lock(&right_root->root_item_lock); - right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - /* * Strategy: Go to the first items of both trees. Then do * @@ -5453,6 +5440,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, * the right if possible or go up and right. */ + down_read(&left_root->fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; left_path->nodes[left_level] = left_root->commit_root; @@ -5462,6 +5450,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level = right_level; right_path->nodes[right_level] = right_root->commit_root; extent_buffer_get(right_path->nodes[right_level]); + up_read(&left_root->fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -5480,67 +5469,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, advance_left = advance_right = 0; while (1) { - /* - * We need to make sure the transaction does not get committed - * while we do anything on commit roots. This means, we need to - * join and leave transactions for every item that we process. - */ - if (trans && btrfs_should_end_transaction(trans, left_root)) { - btrfs_release_path(left_path); - btrfs_release_path(right_path); - - ret = btrfs_end_transaction(trans, left_root); - trans = NULL; - if (ret < 0) - goto out; - } - /* now rejoin the transaction */ - if (!trans) { - trans = btrfs_join_transaction(left_root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out; - } - - spin_lock(&left_root->root_item_lock); - ctransid = btrfs_root_ctransid(&left_root->root_item); - spin_unlock(&left_root->root_item_lock); - if (ctransid != left_start_ctransid) - left_start_ctransid = 0; - - spin_lock(&right_root->root_item_lock); - ctransid = btrfs_root_ctransid(&right_root->root_item); - spin_unlock(&right_root->root_item_lock); - if (ctransid != right_start_ctransid) - right_start_ctransid = 0; - - if (!left_start_ctransid || !right_start_ctransid) { - WARN(1, KERN_WARNING - "BTRFS: btrfs_compare_tree detected " - "a change in one of the trees while " - "iterating. This is probably a " - "bug.\n"); - ret = -EIO; - goto out; - } - - /* - * the commit root may have changed, so start again - * where we stopped - */ - left_path->lowest_level = left_level; - right_path->lowest_level = right_level; - ret = btrfs_search_slot(NULL, left_root, - &left_key, left_path, 0, 0); - if (ret < 0) - goto out; - ret = btrfs_search_slot(NULL, right_root, - &right_key, right_path, 0, 0); - if (ret < 0) - goto out; - } - if (advance_left && !left_end_reached) { ret = tree_advance(left_root, left_path, &left_level, left_root_level, @@ -5640,7 +5568,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_blockptr = btrfs_node_blockptr( right_path->nodes[right_level], right_path->slots[right_level]); - if (left_blockptr == right_blockptr) { + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { /* * As we're on a shared block, don't * allow to go deeper. @@ -5663,14 +5598,6 @@ out: btrfs_free_path(left_path); btrfs_free_path(right_path); kfree(tmp_buf); - - if (trans) { - if (!ret) - ret = btrfs_end_transaction(trans, left_root); - else - btrfs_end_transaction(trans, left_root); - } - return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c1a42ca519f..ba6b88528dc7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_ERROR 0 #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 +#define BTRFS_FS_STATE_DEV_REPLACING 3 /* Super block flags */ /* Errors detected */ @@ -608,6 +609,7 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; + unsigned int need_commit_sem:1; }; /* @@ -985,7 +987,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) enum btrfs_raid_types { BTRFS_RAID_RAID10, @@ -1017,6 +1020,12 @@ enum btrfs_raid_types { */ #define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. + */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + #define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ BTRFS_AVAIL_ALLOC_BIT_SINGLE) @@ -1439,7 +1448,7 @@ struct btrfs_fs_info { */ struct mutex ordered_extent_flush_mutex; - struct rw_semaphore extent_commit_sem; + struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; @@ -1489,6 +1498,7 @@ struct btrfs_fs_info { */ struct list_head ordered_roots; + struct mutex delalloc_root_mutex; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. */ struct list_head delalloc_roots; @@ -1503,28 +1513,27 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ - struct btrfs_workers generic_worker; - struct btrfs_workers workers; - struct btrfs_workers delalloc_workers; - struct btrfs_workers flush_workers; - struct btrfs_workers endio_workers; - struct btrfs_workers endio_meta_workers; - struct btrfs_workers endio_raid56_workers; - struct btrfs_workers rmw_workers; - struct btrfs_workers endio_meta_write_workers; - struct btrfs_workers endio_write_workers; - struct btrfs_workers endio_freespace_worker; - struct btrfs_workers submit_workers; - struct btrfs_workers caching_workers; - struct btrfs_workers readahead_workers; + struct btrfs_workqueue *workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct btrfs_workqueue *endio_workers; + struct btrfs_workqueue *endio_meta_workers; + struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *rmw_workers; + struct btrfs_workqueue *endio_meta_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *submit_workers; + struct btrfs_workqueue *caching_workers; + struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens * for the sys_munmap function call path */ - struct btrfs_workers fixup_workers; - struct btrfs_workers delayed_workers; + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; @@ -1604,9 +1613,9 @@ struct btrfs_fs_info { atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; int scrub_workers_refcnt; - struct btrfs_workers scrub_workers; - struct btrfs_workers scrub_wr_completion_workers; - struct btrfs_workers scrub_nocow_workers; + struct btrfs_workqueue *scrub_workers; + struct btrfs_workqueue *scrub_wr_completion_workers; + struct btrfs_workqueue *scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1647,7 +1656,7 @@ struct btrfs_fs_info { /* qgroup rescan items */ struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; - struct btrfs_workers qgroup_rescan_workers; + struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; @@ -1674,10 +1683,18 @@ struct btrfs_fs_info { atomic_t mutually_exclusive_operation_running; + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; + struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; }; +struct btrfs_subvolume_writers { + struct percpu_counter counter; + wait_queue_head_t wait; +}; + /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. @@ -1702,7 +1719,6 @@ struct btrfs_root { struct btrfs_block_rsv *block_rsv; /* free ino cache stuff */ - struct mutex fs_commit_mutex; struct btrfs_free_space_ctl *free_ino_ctl; enum btrfs_caching_type cached; spinlock_t cache_lock; @@ -1714,11 +1730,15 @@ struct btrfs_root { struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; - unsigned long log_transid; - unsigned long last_log_commit; + int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ + int last_log_commit; pid_t log_start_pid; bool log_multiple_pids; @@ -1793,6 +1813,7 @@ struct btrfs_root { spinlock_t root_item_lock; atomic_t refs; + struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for @@ -1802,6 +1823,8 @@ struct btrfs_root { struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; + + struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents @@ -1822,6 +1845,8 @@ struct btrfs_root { * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; + struct btrfs_subvolume_writers *subv_writers; + atomic_t will_be_snapshoted; }; struct btrfs_ioctl_defrag_range_args { @@ -2033,6 +2058,20 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) +#define btrfs_set_and_info(root, opt, fmt, args...) \ +{ \ + if (!btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_set_opt(root->fs_info->mount_opt, opt); \ +} + +#define btrfs_clear_and_info(root, opt, fmt, args...) \ +{ \ + if (btrfs_test_opt(root, opt)) \ + btrfs_info(root->fs_info, fmt, ##args); \ + btrfs_clear_opt(root->fs_info->mount_opt, opt); \ +} + /* * Inode flags */ @@ -3346,6 +3385,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); + +int btrfs_start_nocow_write(struct btrfs_root *root); +void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3723,7 +3765,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, @@ -4005,6 +4048,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* dev-replace.c */ +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); + /* reada.c */ struct reada_control { struct btrfs_root *root; /* tree to prefetch */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 451b00c86f6c..33e561a84013 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - async_work->work.func = btrfs_async_run_delayed_root; - async_work->work.flags = 0; + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, + NULL, NULL); async_work->nr = nr; - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); return 0; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index f3bff89eecf0..31299646024d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, */ static struct btrfs_delayed_ref_head * find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_head **last, int return_bigger) + int return_bigger) { struct rb_node *n; struct btrfs_delayed_ref_head *entry; - int cmp = 0; -again: n = root->rb_node; entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (last) - *last = entry; if (bytenr < entry->node.bytenr) - cmp = -1; - else if (bytenr > entry->node.bytenr) - cmp = 1; - else - cmp = 0; - - if (cmp < 0) n = n->rb_left; - else if (cmp > 0) + else if (bytenr > entry->node.bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (cmp > 0) { + if (bytenr > entry->node.bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - bytenr = entry->node.bytenr; - return_bigger = 0; - goto again; + return entry; } return entry; } @@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans) again: start = delayed_refs->run_delayed_start; - head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + head = find_ref_head(&delayed_refs->href_root, start, 1); if (!head && !loop) { delayed_refs->run_delayed_start = 0; start = 0; loop = true; - head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + head = find_ref_head(&delayed_refs->href_root, start, 1); if (!head) return NULL; } else if (!head && loop) { @@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref = btrfs_delayed_node_to_head(update); BUG_ON(existing_ref->is_data != ref->is_data); + spin_lock(&existing_ref->lock); if (ref->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - spin_lock(&existing_ref->lock); existing->ref_mod += update->ref_mod; spin_unlock(&existing_ref->lock); } @@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); + return find_ref_head(&delayed_refs->href_root, bytenr, 0); } void btrfs_delayed_ref_exit(void) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 564c92638b20..9f2290509aca 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -431,6 +431,35 @@ leave_no_lock: return ret; } +/* + * blocked until all flighting bios are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + s64 writers; + DEFINE_WAIT(wait); + + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + do { + prepare_to_wait(&fs_info->replace_wait, &wait, + TASK_UNINTERRUPTIBLE); + writers = percpu_counter_sum(&fs_info->bio_counter); + if (writers) + schedule(); + finish_wait(&fs_info->replace_wait, &wait); + } while (writers); +} + +/* + * we have removed target device, it is safe to allow new bios request. + */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device = dev_replace->srcdev; btrfs_dev_replace_unlock(dev_replace); - /* replace old device with new one in mapping tree */ - if (!scrub_ret) - btrfs_dev_replace_update_device_in_mapping_tree(fs_info, - src_device, - tgt_device); - /* * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->chunk_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = @@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - if (scrub_ret) { + /* replace old device with new one in mapping tree */ + if (!scrub_ret) { + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { printk_in_rcu(KERN_ERR "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", src_device->missing ? "<missing disk>" : @@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + btrfs_rm_dev_replace_blocked(fs_info); + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_unblocked(fs_info); + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the @@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) mutex_unlock(&dev_replace->lock_management_lock); } } + +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) +{ + percpu_counter_inc(&fs_info->bio_counter); +} + +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + percpu_counter_dec(&fs_info->bio_counter); + + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + DEFINE_WAIT(wait); +again: + percpu_counter_inc(&fs_info->bio_counter); + if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + goto again; + } + +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 81ea55314b1f..983314932af3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -329,6 +329,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; + bool need_lock = (current->journal_info == + (void *)BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -336,6 +338,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; + if (need_lock) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } + lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 0, &cached_state); if (extent_buffer_uptodate(eb) && @@ -347,10 +354,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, "found %llu\n", eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - clear_extent_buffer_uptodate(eb); + + /* + * Things reading via commit roots that don't have normal protection, + * like send, can have a really old block in cache that may point at a + * block that has been free'd and re-allocated. So don't clear uptodate + * if we find an eb that is under IO (dirty/writeback) because we could + * end up reading in the stale data and then writing it back out and + * making everybody very sad. + */ + if (!extent_buffer_under_io(eb)) + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state, GFP_NOFS); + btrfs_tree_read_unlock_blocking(eb); return ret; } @@ -678,32 +696,31 @@ static void end_workqueue_bio(struct bio *bio, int err) fs_info = end_io_wq->info; end_io_wq->error = err; - end_io_wq->work.func = end_workqueue_fn; - end_io_wq->work.flags = 0; + btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_worker(&fs_info->endio_meta_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_write_workers, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_worker(&fs_info->endio_freespace_worker, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_freespace_worker, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_write_workers, + &end_io_wq->work); } else { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else if (end_io_wq->metadata) - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_workers, + &end_io_wq->work); } } @@ -738,7 +755,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) { unsigned long limit = min_t(unsigned long, - info->workers.max_workers, + info->thread_pool_size, info->fs_devices->open_devices); return 256 * limit; } @@ -811,11 +828,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - async->work.func = run_one_async_start; - async->work.ordered_func = run_one_async_done; - async->work.ordered_free = run_one_async_free; + btrfs_init_work(&async->work, run_one_async_start, + run_one_async_done, run_one_async_free); - async->work.flags = 0; async->bio_flags = bio_flags; async->bio_offset = bio_offset; @@ -824,9 +839,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); if (rw & REQ_SYNC) - btrfs_set_work_high_prio(&async->work); + btrfs_set_work_high_priority(&async->work); - btrfs_queue_worker(&fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { @@ -1149,6 +1164,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, } } +static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) +{ + struct btrfs_subvolume_writers *writers; + int ret; + + writers = kmalloc(sizeof(*writers), GFP_NOFS); + if (!writers) + return ERR_PTR(-ENOMEM); + + ret = percpu_counter_init(&writers->counter, 0); + if (ret < 0) { + kfree(writers); + return ERR_PTR(ret); + } + + init_waitqueue_head(&writers->wait); + return writers; +} + +static void +btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) +{ + percpu_counter_destroy(&writers->counter); + kfree(writers); +} + static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, @@ -1194,16 +1235,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + mutex_init(&root->ordered_extent_mutex); + mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); + atomic_set(&root->will_be_snapshoted, 0); root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; if (fs_info) extent_io_tree_init(&root->dirty_log_pages, @@ -1417,6 +1464,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; return 0; } @@ -1498,6 +1546,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root) { int ret; + struct btrfs_subvolume_writers *writers; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1507,15 +1556,24 @@ int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } + writers = btrfs_alloc_subvolume_writers(); + if (IS_ERR(writers)) { + ret = PTR_ERR(writers); + goto fail; + } + root->subv_writers = writers; + btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + goto free_writers; return 0; + +free_writers: + btrfs_free_subvolume_writers(root->subv_writers); fail: kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); @@ -1990,23 +2048,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, /* helper to cleanup workers */ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->flush_workers); - btrfs_stop_workers(&fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->fixup_workers); + btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->workers); + btrfs_destroy_workqueue(fs_info->endio_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->rmw_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + btrfs_destroy_workqueue(fs_info->endio_write_workers); + btrfs_destroy_workqueue(fs_info->endio_freespace_worker); + btrfs_destroy_workqueue(fs_info->submit_workers); + btrfs_destroy_workqueue(fs_info->delayed_workers); + btrfs_destroy_workqueue(fs_info->caching_workers); + btrfs_destroy_workqueue(fs_info->readahead_workers); + btrfs_destroy_workqueue(fs_info->flush_workers); + btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2097,6 +2154,8 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; + int max_active; + int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; bool create_uuid_tree; bool check_uuid_tree; @@ -2133,10 +2192,16 @@ int open_ctree(struct super_block *sb, goto fail_dirty_metadata_bytes; } + ret = percpu_counter_init(&fs_info->bio_counter, 0); + if (ret) { + err = ret; + goto fail_delalloc_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_delalloc_bytes; + goto fail_bio_counter; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2159,6 +2224,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); @@ -2211,6 +2277,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->scrub_pause_req, 0); atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->replace_wait); init_waitqueue_head(&fs_info->scrub_pause_wait); fs_info->scrub_workers_refcnt = 0; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -2274,7 +2341,7 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); - init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); @@ -2458,104 +2525,68 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - btrfs_init_workers(&fs_info->generic_worker, - "genwork", 1, NULL); + max_active = fs_info->thread_pool_size; - btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); - btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, NULL); + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", - fs_info->thread_pool_size, NULL); + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - btrfs_init_workers(&fs_info->submit_workers, "submit", - min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), NULL); + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); - btrfs_init_workers(&fs_info->caching_workers, "cache", - fs_info->thread_pool_size, NULL); - - /* a higher idle thresh on the submit workers makes it much more + /* + * a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices */ - fs_info->submit_workers.idle_thresh = 64; - - fs_info->workers.idle_thresh = 16; - fs_info->workers.ordered = 1; - - fs_info->delalloc_workers.idle_thresh = 2; - fs_info->delalloc_workers.ordered = 1; - - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_raid56_workers, - "endio-raid56", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->rmw_workers, - "rmw", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", - 1, &fs_info->generic_worker); - btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->readahead_workers, "readahead", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, - &fs_info->generic_worker); + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ - fs_info->endio_workers.idle_thresh = 4; - fs_info->endio_meta_workers.idle_thresh = 4; - fs_info->endio_raid56_workers.idle_thresh = 4; - fs_info->rmw_workers.idle_thresh = 2; - - fs_info->endio_write_workers.idle_thresh = 2; - fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->readahead_workers.idle_thresh = 2; - - /* - * btrfs_start_workers can really only fail because of ENOMEM so just - * return -ENOMEM if any of these fail. - */ - ret = btrfs_start_workers(&fs_info->workers); - ret |= btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->submit_workers); - ret |= btrfs_start_workers(&fs_info->delalloc_workers); - ret |= btrfs_start_workers(&fs_info->fixup_workers); - ret |= btrfs_start_workers(&fs_info->endio_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_workers); - ret |= btrfs_start_workers(&fs_info->rmw_workers); - ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); - ret |= btrfs_start_workers(&fs_info->delayed_workers); - ret |= btrfs_start_workers(&fs_info->caching_workers); - ret |= btrfs_start_workers(&fs_info->readahead_workers); - ret |= btrfs_start_workers(&fs_info->flush_workers); - ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); - if (ret) { + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; } @@ -2830,7 +2861,7 @@ retry_root_backup: printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); @@ -2839,24 +2870,24 @@ retry_root_backup: "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); - goto fail_trans_kthread; + goto fail_qgroup; } if (sb->s_flags & MS_RDONLY) { ret = btrfs_commit_super(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; } } ret = btrfs_find_orphan_roots(tree_root); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) - goto fail_trans_kthread; + goto fail_qgroup; ret = btrfs_recover_relocation(tree_root); if (ret < 0) { @@ -2963,6 +2994,8 @@ fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); +fail_bio_counter: + percpu_counter_destroy(&fs_info->bio_counter); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: @@ -3244,6 +3277,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_send++; continue; @@ -3258,6 +3293,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_wait++; continue; @@ -3477,6 +3514,8 @@ static void free_fs_root(struct btrfs_root *root) root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); + if (root->subv_writers) + btrfs_free_subvolume_writers(root->subv_writers); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); @@ -3610,6 +3649,7 @@ int close_ctree(struct btrfs_root *root) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->bio_counter); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); @@ -3791,9 +3831,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) list_move_tail(&root->ordered_root, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); btrfs_destroy_ordered_extents(root); - cond_resched_lock(&fs_info->ordered_root_lock); + cond_resched(); + spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 32312e09f0f5..5590af92094b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -419,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work) again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); + down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -443,10 +443,10 @@ next: break; if (need_resched() || - rwsem_is_contended(&fs_info->extent_commit_sem)) { + rwsem_is_contended(&fs_info->commit_root_sem)) { caching_ctl->progress = last; btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; @@ -513,7 +513,7 @@ next: err: btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); + up_read(&fs_info->commit_root_sem); free_excluded_extents(extent_root, block_group); @@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - caching_ctl->work.func = caching_thread; + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -633,14 +633,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, return 0; } - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); btrfs_get_block_group(cache); - btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); return ret; } @@ -1542,6 +1542,7 @@ again: ret = 0; } if (ret) { + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -2444,7 +2445,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root)) { + if (rb_first(&locked_ref->ref_root) || + locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); continue; @@ -3541,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return extended_to_chunk(flags | tmp); } -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) { unsigned seq; + u64 flags; do { + flags = orig_flags; seq = read_seqbegin(&root->fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -3971,7 +3975,7 @@ static int can_overcommit(struct btrfs_root *root, } static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, - unsigned long nr_pages) + unsigned long nr_pages, int nr_items) { struct super_block *sb = root->fs_info->sb; @@ -3986,9 +3990,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_roots(root->fs_info, 0); + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, nr_items); } } @@ -4045,7 +4049,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - btrfs_writeback_inodes_sb_nr(root, nr_pages); + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before * we do anything. @@ -4112,13 +4116,9 @@ static int may_commit_transaction(struct btrfs_root *root, goto commit; /* See if there is enough pinned space to make this reservation */ - spin_lock(&space_info->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes) >= 0) { - spin_unlock(&space_info->lock); + bytes) >= 0) goto commit; - } - spin_unlock(&space_info->lock); /* * See if there is some space in the delayed insertion reservation for @@ -4127,16 +4127,13 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; - spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); @@ -4181,7 +4178,7 @@ static int flush_space(struct btrfs_root *root, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes, orig_bytes, + shrink_delalloc(root, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -5477,7 +5474,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; struct btrfs_space_info *space_info; - down_write(&fs_info->extent_commit_sem); + down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { @@ -5496,7 +5493,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, else fs_info->pinned_extents = &fs_info->freed_extents[0]; - up_write(&fs_info->extent_commit_sem); + up_write(&fs_info->commit_root_sem); list_for_each_entry_rcu(space_info, &fs_info->space_info, list) percpu_counter_set(&space_info->total_bytes_pinned, 0); @@ -5725,6 +5722,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret > 0 && skinny_metadata) { skinny_metadata = false; + key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); @@ -5751,6 +5749,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); + btrfs_abort_transaction(trans, extent_root, ret); + goto out; } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -8262,14 +8262,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - down_write(&info->extent_commit_sem); + down_write(&info->commit_root_sem); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); put_caching_control(caching_ctl); } - up_write(&info->extent_commit_sem); + up_write(&info->commit_root_sem); spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { @@ -8343,9 +8343,15 @@ static void __link_block_group(struct btrfs_space_info *space_info, struct btrfs_block_group_cache *cache) { int index = get_block_group_index(cache); + bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) { + if (list_empty(&space_info->block_groups[index])) + first = true; + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); + + if (first) { struct kobject *kobj = &space_info->block_group_kobjs[index]; int ret; @@ -8357,8 +8363,6 @@ static void __link_block_group(struct btrfs_space_info *space_info, kobject_put(&space_info->kobj); } } - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); } static struct btrfs_block_group_cache * @@ -8938,3 +8942,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) range->len = trimmed; return ret; } + +/* + * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up + * waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ + if (unlikely(atomic_read(&root->will_be_snapshoted))) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (unlikely(atomic_read(&root->will_be_snapshoted))) { + btrfs_end_nocow_write(root); + return 0; + } + return 1; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 85bbd01f1271..3955e475ceec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, +static struct rb_node *tree_insert(struct rb_root *root, + struct rb_node *search_start, + u64 offset, struct rb_node *node, struct rb_node ***p_in, struct rb_node **parent_in) { - struct rb_node **p = &root->rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct tree_entry *entry; @@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, goto do_insert; } + p = search_start ? &search_start : &root->rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree, set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node, p, parent); + node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); @@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, - NULL, NULL); + node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, + &prealloc->rb_node, NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; @@ -746,6 +749,7 @@ again: * our range starts */ node = tree_search(tree, start); +process_node: if (!node) break; @@ -766,7 +770,10 @@ again: if (start > end) break; - cond_resched_lock(&tree->lock); + if (!cond_resched_lock(&tree->lock)) { + node = rb_next(node); + goto process_node; + } } out: spin_unlock(&tree->lock); @@ -2757,7 +2764,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, if (em_cached && *em_cached) { em = *em_cached; - if (em->in_tree && start >= em->start && + if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { atomic_inc(&em->refs); return em; @@ -4303,7 +4310,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static int extent_buffer_under_io(struct extent_buffer *eb) +int extent_buffer_under_io(struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 58b27e5ab521..c488b45237bf 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -320,6 +320,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_buffer *eb); int clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); +int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 996ad56b57db..1874aee69c86 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void) em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; @@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em) return; WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { - WARN_ON(em->in_tree); + WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } @@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - WARN_ON(!entry->in_tree); - if (em->start < entry->start) p = &(*p)->rb_left; else if (em->start >= extent_map_end(entry)) @@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) if (end > entry->start && em->start < extent_map_end(entry)) return -EEXIST; - em->in_tree = 1; rb_link_node(&em->rb_node, orig_parent, p); rb_insert_color(&em->rb_node, root); return 0; @@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, prev = n; prev_entry = entry; - WARN_ON(!entry->in_tree); - if (offset < entry->start) n = n->rb_left; else if (offset >= extent_map_end(entry)) @@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; - merge->in_tree = 0; em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); rb_erase(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } } @@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; + RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); free_extent_map(merge); @@ -319,7 +314,21 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - if (em->in_tree) + if (extent_map_in_tree(em)) + try_merge_map(tree, em); +} + +static inline void setup_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, + int modified) +{ + atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + + if (modified) + list_move(&em->list, &tree->modified_extents); + else try_merge_map(tree, em); } @@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree, if (ret) goto out; - atomic_inc(&em->refs); - - em->mod_start = em->start; - em->mod_len = em->len; - - if (modified) - list_move(&em->list, &tree->modified_extents); - else - try_merge_map(tree, em); + setup_extent_mapping(tree, em, modified); out: return ret; } @@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) rb_erase(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); return ret; } + +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) +{ + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + ASSERT(extent_map_in_tree(cur)); + if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + list_del_init(&cur->list); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + RB_CLEAR_NODE(&cur->rb_node); + + setup_extent_mapping(tree, new, modified); +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 93fba716d7f8..e7fd8a56a140 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -33,7 +33,6 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree; unsigned int compress_type; struct list_head list; }; @@ -44,6 +43,11 @@ struct extent_map_tree { rwlock_t lock; }; +static inline int extent_map_in_tree(const struct extent_map *em) +{ + return !RB_EMPTY_NODE(&em->rb_node); +} + static inline u64 extent_map_end(struct extent_map *em) { if (em->start + em->len < em->start) @@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0165b8672f09..ae6af072b635 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked */ - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); @@ -591,7 +586,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); modified = !list_empty(&em->list); - remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -622,8 +616,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -661,12 +654,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->orig_block_len = 0; } - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } free_extent_map(split); split = NULL; } next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for us */ @@ -720,7 +721,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size) + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; while (1) { @@ -798,7 +799,10 @@ next_slot: */ if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = start; @@ -841,7 +845,10 @@ next_slot: * | -------- extent -------- | */ if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; @@ -864,7 +871,10 @@ next_slot: */ if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); @@ -938,34 +948,42 @@ next_slot: * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted - * path->slots[0] for our insertion. + * path->slots[0] for our insertion (if replace_extent != 0). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); + } - leaf = path->nodes[0]; - /* - * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that - * is, its contents got pushed to its neighbors), in which case - * it means path->locks[0] == 0 - */ - if (!ret && replace_extent && leafs_visited == 1 && - path->locks[0] && - btrfs_leaf_free_space(root, leaf) >= - sizeof(struct btrfs_item) + extent_item_size) { - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + - extent_item_size, 1); - *key_inserted = 1; + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; } if (!replace_extent || !(*key_inserted)) @@ -1346,11 +1364,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, last_pos, 0, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && ordered->file_offset <= last_pos) { - btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start_pos, last_pos, cached_state, GFP_NOFS); @@ -1358,12 +1376,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, unlock_page(pages[i]); page_cache_release(pages[i]); } - ret = btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos + 1); - if (ret) - return ret; - else - return -EAGAIN; + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); @@ -1396,8 +1411,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; + ret = btrfs_start_nocow_write(root); + if (!ret) + return -ENOSPC; + lockstart = round_down(pos, root->sectorsize); - lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; while (1) { lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1415,12 +1434,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; + btrfs_end_nocow_write(root); } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - NULL, GFP_NOFS); - *write_bytes = min_t(size_t, *write_bytes, num_bytes); + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); } unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1510,6 +1527,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!only_release_metadata) btrfs_free_reserved_data_space(inode, reserve_bytes); + else + btrfs_end_nocow_write(root); break; } @@ -1598,6 +1617,9 @@ again: } release_bytes = 0; + if (only_release_metadata) + btrfs_end_nocow_write(root); + if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); u64 lockend = lockstart + @@ -1624,10 +1646,12 @@ again: kfree(pages); if (release_bytes) { - if (only_release_metadata) + if (only_release_metadata) { + btrfs_end_nocow_write(root); btrfs_delalloc_release_metadata(inode, release_bytes); - else + } else { btrfs_delalloc_release_space(inode, release_bytes); + } } return num_written ? num_written : ret; @@ -1636,7 +1660,7 @@ again: static ssize_t __btrfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) + size_t count, size_t ocount) { struct file *file = iocb->ki_filp; struct iov_iter i; @@ -1645,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) @@ -1664,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, if (err) goto out; written += written_buffered; - *ppos = pos + written_buffered; + iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); out: @@ -1696,8 +1720,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t *ppos = &iocb->ki_pos; u64 start_pos; + u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; size_t count, ocount; @@ -1752,7 +1776,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + count, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1764,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); + pos, count, ocount); } else { struct iov_iter i; @@ -1772,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = __btrfs_buffered_write(file, &i, pos); if (num_written > 0) - *ppos = pos + num_written; + iocb->ki_pos = pos + num_written; } mutex_unlock(&inode->i_mutex); @@ -1797,7 +1823,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); - if (err < 0 && num_written > 0) + if (err < 0) num_written = err; } @@ -1856,8 +1882,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; trace_btrfs_sync_file(file, datasync); @@ -1951,7 +1978,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry); + btrfs_init_log_ctx(&ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -1971,7 +2000,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { - ret = btrfs_sync_log(trans, root); + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { ret = btrfs_end_transaction(trans, root); goto out; @@ -1993,6 +2022,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -2157,6 +2187,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2172,14 +2203,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); mutex_unlock(&inode->i_mutex); return ret; } /* zero back part of the first page */ - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset < ino_size) { ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2188,7 +2219,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } /* zero the front end of the last page */ - if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset + len < ino_size) { ret = btrfs_truncate_page(inode, offset + len, 0, 1); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2277,10 +2308,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - break; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } } cur_offset = drop_end; @@ -2313,10 +2347,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - goto out_trans; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } } out_trans: diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ab485e57b6fe..86935f5ae291 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -55,7 +55,7 @@ static int caching_kthread(void *data) key.type = BTRFS_INODE_ITEM_KEY; again: /* need to make sure the commit_root doesn't disappear */ - mutex_lock(&root->fs_commit_mutex); + down_read(&fs_info->commit_root_sem); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -88,7 +88,7 @@ again: btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(path); root->cache_progress = last; - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); schedule_timeout(1); goto again; } else @@ -127,7 +127,7 @@ next: btrfs_unpin_free_ino(root); out: wake_up(&root->cache_wait); - mutex_unlock(&root->fs_commit_mutex); + up_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -176,7 +176,11 @@ static void start_caching(struct btrfs_root *root) tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ + if (IS_ERR(tsk)) { + btrfs_warn(root->fs_info, "failed to start inode caching task"); + btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + "disabling inode map caching"); + } } int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) @@ -205,42 +209,28 @@ again: void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - again: if (root->cached == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(ctl, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); } else { - /* - * If we are in the process of caching free ino chunks, - * to avoid adding the same inode number to the free_ino - * tree twice due to cross transaction, we'll leave it - * in the pinned tree until a transaction is committed - * or the caching work is done. - */ - - mutex_lock(&root->fs_commit_mutex); + down_write(&root->fs_info->commit_root_sem); spin_lock(&root->cache_lock); if (root->cached == BTRFS_CACHE_FINISHED) { spin_unlock(&root->cache_lock); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); goto again; } spin_unlock(&root->cache_lock); start_caching(root); - if (objectid <= root->cache_progress || - objectid >= root->highest_objectid) - __btrfs_add_free_space(ctl, objectid, 1); - else - __btrfs_add_free_space(pinned, objectid, 1); + __btrfs_add_free_space(pinned, objectid, 1); - mutex_unlock(&root->fs_commit_mutex); + up_write(&root->fs_info->commit_root_sem); } } @@ -250,7 +240,7 @@ again: * and others will just be dropped, because the commit root we were * searching has changed. * - * Must be called with root->fs_commit_mutex held + * Must be called with root->fs_info->commit_root_sem held */ void btrfs_unpin_free_ino(struct btrfs_root *root) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3d44486290b..5f805bc944fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode, (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if ((end - start + 1) <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -864,7 +872,8 @@ static noinline int cow_file_range(struct inode *inode, if (btrfs_is_free_space_inode(inode)) { WARN_ON_ONCE(1); - return -EINVAL; + ret = -EINVAL; + goto out_unlock; } num_bytes = ALIGN(end - start + 1, blocksize); @@ -1075,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - async_cow->work.func = async_cow_start; - async_cow->work.ordered_func = async_cow_submit; - async_cow->work.ordered_free = async_cow_free; - async_cow->work.flags = 0; + btrfs_init_work(&async_cow->work, async_cow_start, + async_cow_submit, async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); - btrfs_queue_worker(&root->fs_info->delalloc_workers, - &async_cow->work); + btrfs_queue_work(root->fs_info->delalloc_workers, + &async_cow->work); if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { wait_event(root->fs_info->async_submit_wait, @@ -1272,6 +1279,15 @@ next_slot: disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* + * if there are pending snapshots for this root, + * we fall into common COW way. + */ + if (!nolock) { + err = btrfs_start_nocow_write(root); + if (!err) + goto out_check; + } + /* * force cow if csum exists in the range. * this ensure that csum for a given extent are * either valid or do not exist. @@ -1290,6 +1306,8 @@ next_slot: out_check: if (extent_end <= start) { path->slots[0]++; + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto next_slot; } if (!nocow) { @@ -1307,8 +1325,11 @@ out_check: ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written, 1); - if (ret) + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; + } cow_start = (u64)-1; } @@ -1355,8 +1376,11 @@ out_check: BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) + if (ret) { + if (!nolock && nocow) + btrfs_end_nocow_write(root); goto error; + } } extent_clear_unlock_delalloc(inode, cur_offset, @@ -1364,6 +1388,8 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) + btrfs_end_nocow_write(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -1843,9 +1869,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - fixup->work.func = btrfs_writepage_fixup_worker; + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; } @@ -2239,6 +2265,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return PTR_ERR(root); } + if (btrfs_root_readonly(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + /* step 2: get inode */ key.objectid = backref->inum; key.type = BTRFS_INODE_ITEM_KEY; @@ -2759,7 +2790,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workers *workers; + struct btrfs_workqueue *workers; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2768,14 +2799,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - ordered_extent->work.func = finish_ordered_fn; - ordered_extent->work.flags = 0; + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); if (btrfs_is_free_space_inode(inode)) - workers = &root->fs_info->endio_freespace_worker; + workers = root->fs_info->endio_freespace_worker; else - workers = &root->fs_info->endio_write_workers; - btrfs_queue_worker(workers, &ordered_extent->work); + workers = root->fs_info->endio_write_workers; + btrfs_queue_work(workers, &ordered_extent->work); return 0; } @@ -4593,7 +4623,7 @@ static void evict_inode_truncate_pages(struct inode *inode) struct rb_node *node; ASSERT(inode->i_state & I_FREEING); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); write_lock(&map_tree->lock); while (!RB_EMPTY_ROOT(&map_tree->map)) { @@ -4924,7 +4954,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - WARN_ON(btrfs_root_refs(&root->root_item) != 0); + if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); again: @@ -5799,6 +5830,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); @@ -5872,6 +5904,7 @@ out_unlock: inode_dec_link_count(inode); iput(inode); } + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } @@ -5930,6 +5963,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -5996,6 +6030,7 @@ out_fail: btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } @@ -6550,6 +6585,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int ret; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 disk_bytenr; @@ -6626,6 +6662,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (btrfs_extent_readonly(root, disk_bytenr)) goto out; + + num_bytes = min(offset + *len, extent_end) - offset; + if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 range_end; + + range_end = round_up(offset + num_bytes, root->sectorsize) - 1; + ret = test_range_bit(io_tree, offset, range_end, + EXTENT_DELALLOC, 0, NULL); + if (ret) { + ret = -EAGAIN; + goto out; + } + } + btrfs_release_path(path); /* @@ -6654,7 +6704,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, */ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; - num_bytes = min(offset + *len, extent_end) - offset; if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out; /* @@ -7024,10 +7073,9 @@ again: if (!ret) goto out_test; - ordered->work.func = finish_ordered_fn; - ordered->work.flags = 0; - btrfs_queue_worker(&root->fs_info->endio_write_workers, - &ordered->work); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(root->fs_info->endio_write_workers, + &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -7404,15 +7452,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, smp_mb__after_atomic_inc(); /* - * The generic stuff only does filemap_write_and_wait_range, which isn't - * enough if we've written compressed pages to this area, so we need to - * call btrfs_wait_ordered_range to make absolutely sure that any - * outstanding dirty pages are on disk. + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. */ count = iov_length(iov, nr_segs); - ret = btrfs_wait_ordered_range(inode, offset, count); - if (ret) - return ret; + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, count); if (rw & WRITE) { /* @@ -8404,7 +8452,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - work->work.func = btrfs_run_delalloc_work; + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -8419,7 +8467,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, + int nr) { struct btrfs_inode *binode; struct inode *inode; @@ -8431,6 +8480,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); + mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -8456,19 +8506,16 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) goto out; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); - + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); + ret++; + if (nr != -1 && ret >= nr) + goto out; cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } - return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); @@ -8480,6 +8527,7 @@ out: list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); } + mutex_unlock(&root->delalloc_mutex); return ret; } @@ -8490,7 +8538,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, -1); + if (ret > 0) + ret = 0; /* * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that @@ -8507,7 +8557,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return ret; } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr) { struct btrfs_root *root; struct list_head splice; @@ -8518,9 +8569,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_fs_root(root); @@ -8529,15 +8581,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, nr); btrfs_put_fs_root(root); - if (ret) + if (ret < 0) goto out; + if (nr != -1) { + nr -= ret; + WARN_ON(nr < 0); + } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); + ret = 0; atomic_inc(&fs_info->async_submit_draining); while (atomic_read(&fs_info->nr_async_submits) || atomic_read(&fs_info->async_delalloc_pages)) { @@ -8546,13 +8603,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) atomic_read(&fs_info->async_delalloc_pages) == 0)); } atomic_dec(&fs_info->async_submit_draining); - return 0; out: if (!list_empty_careful(&splice)) { spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } + mutex_unlock(&fs_info->delalloc_root_mutex); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a6d8efa46bfe..2f6d7b13b5bd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -59,6 +59,32 @@ #include "props.h" #include "sysfs.h" +#ifdef CONFIG_64BIT +/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI + * structures are incorrect, as the timespec structure from userspace + * is 4 bytes too small. We define these alternatives here to teach + * the kernel about the 32-bit struct packing. + */ +struct btrfs_ioctl_timespec_32 { + __u64 sec; + __u32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_ioctl_received_subvol_args_32 { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec_32 stime; /* in */ + struct btrfs_ioctl_timespec_32 rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args_32) +#endif + + static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff); @@ -585,6 +611,23 @@ fail: return ret; } +static void btrfs_wait_nocow_write(struct btrfs_root *root) +{ + s64 writers; + DEFINE_WAIT(wait); + + do { + prepare_to_wait(&root->subv_writers->wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&root->subv_writers->counter); + if (writers) + schedule(); + + finish_wait(&root->subv_writers->wait, &wait); + } while (writers); +} + static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, bool readonly, @@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root->ref_cows) return -EINVAL; + atomic_inc(&root->will_be_snapshoted); + smp_mb__after_atomic_inc(); + btrfs_wait_nocow_write(root); + ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - return ret; + goto out; btrfs_wait_ordered_extents(root, -1); pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) - return -ENOMEM; + if (!pending_snapshot) { + ret = -ENOMEM; + goto out; + } btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto out; + goto free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -674,8 +723,10 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -out: +free: kfree(pending_snapshot); +out: + atomic_dec(&root->will_be_snapshoted); return ret; } @@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root, min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - path->keep_locks = 1; - while (1) { + path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); +process_slot: if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) @@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root, return 0; } + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); + goto process_slot; + } + if (min_key.offset == (u64)-1) goto none; @@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) read_unlock(&em_tree->lock); if (!em) { + struct extent_state *cached = NULL; + u64 end = start + len - 1; + /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); + lock_extent_bits(io_tree, start, end, 0, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); + unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); if (IS_ERR(em)) return NULL; @@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return false; next = defrag_lookup_extent(inode, em->start + em->len); - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || + (em->block_start + em->block_len == next->block_start)) ret = false; free_extent_map(next); @@ -1076,10 +1139,12 @@ again: page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { - lock_extent(tree, page_start, page_end); + lock_extent_bits(tree, page_start, page_end, + 0, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); - unlock_extent(tree, page_start, page_end); + unlock_extent_cached(tree, page_start, page_end, + &cached_state, GFP_NOFS); if (!ordered) break; @@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } } - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { /* the filemap_flush will queue IO into the worker threads, but @@ -1403,6 +1472,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; + char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1470,8 +1540,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, mod = 1; sizestr++; } - new_size = memparse(sizestr, NULL); - if (new_size == 0) { + new_size = memparse(sizestr, &retptr); + if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; goto out_free; } @@ -1573,7 +1643,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (src_inode->i_sb != file_inode(file)->i_sb) { btrfs_info(BTRFS_I(src_inode)->root->fs_info, "Snapshot src from another FS"); - ret = -EINVAL; + ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { /* * Subvolume creation is not restricted, but snapshots @@ -1797,7 +1867,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { - ret = -ENOTEMPTY; + ret = -EPERM; + btrfs_err(root->fs_info, "deleting default subvolume " + "%llu is not allowed", key.objectid); goto out; } btrfs_release_path(path); @@ -2994,8 +3066,9 @@ process_slot: new_key.offset + datal, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3047,6 +3120,8 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3063,13 +3138,16 @@ process_slot: size -= skip + trim; datal -= skip + trim; + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, new_key.offset, - new_key.offset + datal, + aligned_end, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3153,8 +3231,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * decompress into destination's address_space (the file offset * may change, so source mapping won't do), then recompress (or * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? + * + * - split destination inode's inline extents. The inline extents can + * be either compressed or non-compressed. */ /* the destination must be opened for writing */ @@ -3465,6 +3544,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Global block reserve, exported as a space_info + */ + slot_count++; + /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; @@ -3523,6 +3607,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) up_read(&info->groups_sem); } + /* + * Add global block reserve + */ + if (slot_count) { + struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + + spin_lock(&block_rsv->lock); + space.total_bytes = block_rsv->size; + space.used_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; + memcpy(dest, &space, sizeof(space)); + space_args.total_spaces++; + } + user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); @@ -4353,10 +4452,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) return btrfs_qgroup_wait_for_completion(root->fs_info); } -static long btrfs_ioctl_set_received_subvol(struct file *file, - void __user *arg) +static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct btrfs_ioctl_received_subvol_args *sa) { - struct btrfs_ioctl_received_subvol_args *sa = NULL; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; @@ -4384,13 +4482,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) { - ret = PTR_ERR(sa); - sa = NULL; - goto out; - } - /* * 1 - root item * 2 - uuid items (received uuid + subvol uuid) @@ -4444,14 +4535,90 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } +out: + up_write(&root->fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + +#ifdef CONFIG_64BIT +static long btrfs_ioctl_set_received_subvol_32(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; + struct btrfs_ioctl_received_subvol_args *args64 = NULL; + int ret = 0; + + args32 = memdup_user(arg, sizeof(*args32)); + if (IS_ERR(args32)) { + ret = PTR_ERR(args32); + args32 = NULL; + goto out; + } + + args64 = kmalloc(sizeof(*args64), GFP_NOFS); + if (!args64) { + ret = -ENOMEM; + goto out; + } + + memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); + args64->stransid = args32->stransid; + args64->rtransid = args32->rtransid; + args64->stime.sec = args32->stime.sec; + args64->stime.nsec = args32->stime.nsec; + args64->rtime.sec = args32->rtime.sec; + args64->rtime.nsec = args32->rtime.nsec; + args64->flags = args32->flags; + + ret = _btrfs_ioctl_set_received_subvol(file, args64); + if (ret) + goto out; + + memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); + args32->stransid = args64->stransid; + args32->rtransid = args64->rtransid; + args32->stime.sec = args64->stime.sec; + args32->stime.nsec = args64->stime.nsec; + args32->rtime.sec = args64->rtime.sec; + args32->rtime.nsec = args64->rtime.nsec; + args32->flags = args64->flags; + + ret = copy_to_user(arg, args32, sizeof(*args32)); + if (ret) + ret = -EFAULT; + +out: + kfree(args32); + kfree(args64); + return ret; +} +#endif + +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + int ret = 0; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + sa = NULL; + goto out; + } + + ret = _btrfs_ioctl_set_received_subvol(file, sa); + + if (ret) + goto out; + ret = copy_to_user(arg, sa, sizeof(*sa)); if (ret) ret = -EFAULT; out: kfree(sa); - up_write(&root->fs_info->subvol_sem); - mnt_drop_write_file(file); return ret; } @@ -4746,7 +4913,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) return ret; ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); @@ -4770,6 +4937,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_progress(root, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); +#ifdef CONFIG_64BIT + case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: + return btrfs_ioctl_set_received_subvol_32(file, argp); +#endif case BTRFS_IOC_SEND: return btrfs_ioctl_send(file, argp); case BTRFS_IOC_GET_DEV_STATS: diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b16450b840e7..a94b05f72869 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; @@ -410,10 +413,13 @@ have_entry: if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; @@ -424,27 +430,48 @@ out: } /* Needs to either be called under a log transaction or the log_mutex */ -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list) { struct btrfs_ordered_inode_tree *tree; struct btrfs_ordered_extent *ordered; struct rb_node *n; - int index = log->log_transid % 2; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); for (n = rb_first(&tree->tree); n; n = rb_next(n)) { ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); - spin_lock(&log->log_extents_lock[index]); - if (list_empty(&ordered->log_list)) { - list_add_tail(&ordered->log_list, &log->logged_list[index]); - atomic_inc(&ordered->refs); - } - spin_unlock(&log->log_extents_lock[index]); + if (!list_empty(&ordered->log_list)) + continue; + list_add_tail(&ordered->log_list, logged_list); + atomic_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } +void btrfs_put_logged_extents(struct list_head *logged_list) +{ + struct btrfs_ordered_extent *ordered; + + while (!list_empty(logged_list)) { + ordered = list_first_entry(logged_list, + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log) +{ + int index = log->log_transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + list_splice_tail(logged_list, &log->logged_list[index]); + spin_unlock_irq(&log->log_extents_lock[index]); +} + void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; @@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); - mutex_lock(&root->fs_info->ordered_operations_mutex); + mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); list_splice_init(&root->ordered_extents, &splice); while (!list_empty(&splice) && nr) { @@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) atomic_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); - ordered->flush_work.func = btrfs_run_ordered_extent_work; + btrfs_init_work(&ordered->flush_work, + btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &ordered->flush_work); + btrfs_queue_work(root->fs_info->flush_workers, + &ordered->flush_work); cond_resched(); spin_lock(&root->ordered_extent_lock); @@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) btrfs_put_ordered_extent(ordered); cond_resched(); } - mutex_unlock(&root->fs_info->ordered_operations_mutex); + mutex_unlock(&root->ordered_extent_mutex); return count; } @@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); while (!list_empty(&splice) && nr) { @@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) } list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); + mutex_unlock(&fs_info->ordered_operations_mutex); } /* @@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, goto out; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); cond_resched(); spin_lock(&root->fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 9b0450f7ac20..246897058efb 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list); +void btrfs_put_logged_extents(struct list_head *logged_list); +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 472302a2d745..2cf905877aaf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } ret = 0; } @@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); - fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; + btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { err: @@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); return 0; } @@ -2190,6 +2191,6 @@ void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9af0b25d991a..4055291a523e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1416,20 +1416,18 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = rmw_work; + btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = read_rebuild_work; + btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } /* @@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - plug->work.flags = 0; - plug->work.func = unplug_work; - btrfs_queue_worker(&plug->info->rmw_workers, - &plug->work); + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_queue_work(plug->info->rmw_workers, + &plug->work); return; } run_plug(plug); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 31c797c48c3e..30947f923620 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - rmw->work.func = reada_start_machine_worker; + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; - btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); + btrfs_queue_work(fs_info->readahead_workers, &rmw->work); } #ifdef DEBUG diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 07b3b36f40ee..7f92ab1daa87 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2317,7 +2317,6 @@ void free_reloc_roots(struct list_head *list) static noinline_for_stack int merge_reloc_roots(struct reloc_control *rc) { - struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_root *reloc_root; u64 last_snap; @@ -2375,26 +2374,6 @@ again: list_add_tail(&reloc_root->root_list, &reloc_roots); goto out; - } else if (!ret) { - /* - * recover the last snapshot tranid to avoid - * the space balance break NOCOW. - */ - root = read_fs_root(rc->extent_root->fs_info, - objectid); - if (IS_ERR(root)) - continue; - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - - /* Check if the fs/file tree was snapshoted or not. */ - if (btrfs_root_last_snapshot(&root->root_item) == - otransid - 1) - btrfs_set_root_last_snapshot(&root->root_item, - last_snap); - - btrfs_end_transaction(trans, root); } } @@ -4248,7 +4227,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0); + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); if (ret < 0) { err = ret; goto out; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 1389b69059de..38bb47e7d6b1 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/err.h> #include <linux/uuid.h> #include "ctree.h" #include "transaction.h" @@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.offset++; root = btrfs_read_fs_root(tree_root, &root_key); - err = PTR_RET(root); + err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { break; } else if (err == -ENOENT) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index efba5d1282ee..0be77993378e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + + /* + * check if @scrubs_running=@scrubs_paused condition + * inside wait_event() is not an atomic operation. + * which means we may inc/dec @scrub_running/paused + * at any time. Let's wake up @scrub_pause_wait as + * much as we can to let commit transaction blocked less. + */ + wake_up(&fs_info->scrub_pause_wait); + atomic_inc(&sctx->workers_pending); } @@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - sbio->work.func = scrub_bio_end_io_worker; + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, + NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -987,9 +998,10 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - fixup_nodatasum->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, - &fixup_nodatasum->work); + btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, + NULL, NULL); + btrfs_queue_work(fs_info->scrub_workers, + &fixup_nodatasum->work); goto out; } @@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - sbio->work.func = scrub_wr_bio_end_io_worker; - btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) @@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + btrfs_queue_work(fs_info->scrub_workers, &sbio->work); } static void scrub_bio_end_io_worker(struct btrfs_work *work) @@ -2223,6 +2235,47 @@ behind_scrub_pages: return 0; } +/* + * Given a physical address, this will calculate it's + * logical offset. if this is a parity stripe, it will return + * the most left data stripe's logical offset. + * + * return 0 if it is a data stripe, 1 means parity stripe. + */ +static int get_raid56_logic_offset(u64 physical, int num, + struct map_lookup *map, u64 *offset) +{ + int i; + int j = 0; + u64 stripe_nr; + u64 last_offset; + int stripe_index; + int rot; + + last_offset = (physical - map->stripes[num].physical) * + nr_data_stripes(map); + *offset = last_offset; + for (i = 0; i < nr_data_stripes(map); i++) { + *offset = last_offset + i * map->stripe_len; + + stripe_nr = *offset; + do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, nr_data_stripes(map)); + + /* Work out the disk rotation on this stripe-set */ + rot = do_div(stripe_nr, map->num_stripes); + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; + if (stripe_index == num) + return 0; + if (stripe_index < num) + j++; + } + *offset = last_offset + j * map->stripe_len; + return 1; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -2244,6 +2297,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 physical; u64 logical; u64 logic_end; + u64 physical_end; u64 generation; int mirror_num; struct reada_control *reada1; @@ -2257,16 +2311,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_len; struct btrfs_device *extent_dev; int extent_mirror_num; - int stop_loop; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - if (num >= nr_data_stripes(map)) { - return 0; - } - } + int stop_loop = 0; nstripes = length; + physical = map->stripes[num].physical; offset = 0; do_div(nstripes, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { @@ -2284,6 +2332,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical, num, map, &offset); + increment = map->stripe_len * nr_data_stripes(map); + mirror_num = 1; } else { increment = map->stripe_len; mirror_num = 1; @@ -2307,7 +2360,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * to not hold off transaction commits */ logical = base + offset; - + physical_end = physical + nstripes * map->stripe_len; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + get_raid56_logic_offset(physical_end, num, + map, &logic_end); + logic_end += base; + } else { + logic_end = logical + increment * nstripes; + } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); @@ -2316,7 +2377,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.objectid = logical; key_start.type = BTRFS_EXTENT_ITEM_KEY; key_start.offset = (u64)0; - key_end.objectid = base + offset + nstripes * increment; + key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key_start, &key_end); @@ -2326,7 +2387,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = base + offset + nstripes * increment; + key_end.offset = logic_end; reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); if (!IS_ERR(reada1)) @@ -2344,11 +2405,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* * now find all extents for each stripe and scrub them */ - logical = base + offset; - physical = map->stripes[num].physical; - logic_end = logical + increment * nstripes; ret = 0; - while (logical < logic_end) { + while (physical < physical_end) { + /* for raid56, we skip parity stripe */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + ret = get_raid56_logic_offset(physical, num, + map, &logical); + logical += base; + if (ret) + goto skip; + } /* * canceled? */ @@ -2492,15 +2559,29 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - logical += increment; - physical += map->stripe_len; - + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + /* + * loop until we find next data stripe + * or we have finished all stripes. + */ + do { + physical += map->stripe_len; + ret = get_raid56_logic_offset( + physical, num, + map, &logical); + logical += base; + } while (physical < physical_end && ret); + } else { + physical += map->stripe_len; + logical += increment; + } if (logical < key.objectid + bytes) { cond_resched(); goto again; } - if (logical >= logic_end) { + if (physical >= physical_end) { stop_loop = 1; break; } @@ -2509,6 +2590,7 @@ next: path->slots[0]++; } btrfs_release_path(path); +skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); @@ -2686,10 +2768,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + /* + * must be called before we decrease @scrub_paused. + * make sure we don't block transaction commit while + * we are waiting pending workers finished. + */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - scrub_blocked_if_needed(fs_info); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); btrfs_put_block_group(cache); if (ret) @@ -2757,33 +2852,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { int ret = 0; + int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) - btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, - &fs_info->generic_worker); + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + 1, 4); else - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_workers.idle_thresh = 4; - ret = btrfs_start_workers(&fs_info->scrub_workers); - if (ret) + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + max_active, 4); + if (!fs_info->scrub_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_wr_completion_workers, - "scrubwrc", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_wr_completion_workers.idle_thresh = 2; - ret = btrfs_start_workers( - &fs_info->scrub_wr_completion_workers); - if (ret) + } + fs_info->scrub_wr_completion_workers = + btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + max_active, 2); + if (!fs_info->scrub_wr_completion_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, - &fs_info->generic_worker); - ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); - if (ret) + } + fs_info->scrub_nocow_workers = + btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + if (!fs_info->scrub_nocow_workers) { + ret = -ENOMEM; goto out; + } } ++fs_info->scrub_workers_refcnt; out: @@ -2793,9 +2890,9 @@ out: static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (--fs_info->scrub_workers_refcnt == 0) { - btrfs_stop_workers(&fs_info->scrub_workers); - btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); - btrfs_stop_workers(&fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } @@ -3106,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - nocow_ctx->work.func = copy_nocow_pages_worker; + btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); - btrfs_queue_worker(&fs_info->scrub_nocow_workers, - &nocow_ctx->work); + btrfs_queue_work(fs_info->scrub_nocow_workers, + &nocow_ctx->work); return 0; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9dde9717c1b9..484aacac2c89 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -51,15 +51,18 @@ struct fs_path { struct { char *start; char *end; - char *prepared; char *buf; - int buf_len; - unsigned int reversed:1; - unsigned int virtual_mem:1; + unsigned short buf_len:15; + unsigned short reversed:1; char inline_buf[]; }; - char pad[PAGE_SIZE]; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. + */ + char pad[256]; }; }; #define FS_PATH_INLINE_SIZE \ @@ -109,6 +112,7 @@ struct send_ctx { int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 send_progress; @@ -120,6 +124,8 @@ struct send_ctx { struct list_head name_cache_list; int name_cache_size; + struct file_ra_state ra; + char *read_buf; /* @@ -175,6 +181,47 @@ struct send_ctx { * own move/rename can be performed. */ struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above. In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; }; struct pending_dir_move { @@ -189,6 +236,18 @@ struct pending_dir_move { struct waiting_dir_move { struct rb_node node; u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. + */ + u64 rmdir_ino; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; }; struct name_cache_entry { @@ -214,6 +273,11 @@ struct name_cache_entry { static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + static int need_send_hole(struct send_ctx *sctx) { return (sctx->parent_root && !sctx->cur_inode_new && @@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void) if (!p) return NULL; p->reversed = 0; - p->virtual_mem = 0; p->buf = p->inline_buf; p->buf_len = FS_PATH_INLINE_SIZE; fs_path_reset(p); @@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p) { if (!p) return; - if (p->buf != p->inline_buf) { - if (p->virtual_mem) - vfree(p->buf); - else - kfree(p->buf); - } + if (p->buf != p->inline_buf) + kfree(p->buf); kfree(p); } @@ -290,42 +349,33 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; + if (len > PATH_MAX) { + WARN_ON(1); + return -ENOMEM; + } + path_len = p->end - p->start; old_buf_len = p->buf_len; - len = PAGE_ALIGN(len); + /* + * First time the inline_buf does not suffice + */ if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - p->virtual_mem = 1; - } - memcpy(tmp_buf, p->buf, p->buf_len); - p->buf = tmp_buf; - p->buf_len = len; + tmp_buf = kmalloc(len, GFP_NOFS); + if (tmp_buf) + memcpy(tmp_buf, p->buf, old_buf_len); } else { - if (p->virtual_mem) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - vfree(p->buf); - } else { - tmp_buf = krealloc(p->buf, len, GFP_NOFS); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - kfree(p->buf); - p->virtual_mem = 1; - } - } - p->buf = tmp_buf; - p->buf_len = len; + tmp_buf = krealloc(p->buf, len, GFP_NOFS); } + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + /* + * The real size of the buffer is bigger, this will let the fast path + * happen most of the time + */ + p->buf_len = ksize(p->buf); + if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; p->end = p->buf + p->buf_len - 1; @@ -338,7 +388,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) return 0; } -static int fs_path_prepare_for_add(struct fs_path *p, int name_len) +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) { int ret; int new_len; @@ -354,11 +405,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len) if (p->start != p->end) *--p->start = '/'; p->start -= name_len; - p->prepared = p->start; + *prepared = p->start; } else { if (p->start != p->end) *p->end++ = '/'; - p->prepared = p->end; + *prepared = p->end; p->end += name_len; *p->end = 0; } @@ -370,12 +421,12 @@ out: static int fs_path_add(struct fs_path *p, const char *name, int name_len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, name_len); + ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, name, name_len); - p->prepared = NULL; + memcpy(prepared, name, name_len); out: return ret; @@ -384,12 +435,12 @@ out: static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, p2->end - p2->start); + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, p2->start, p2->end - p2->start); - p->prepared = NULL; + memcpy(prepared, p2->start, p2->end - p2->start); out: return ret; @@ -400,13 +451,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, unsigned long off, int len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, len); + ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) goto out; - read_extent_buffer(eb, p->prepared, off, len); - p->prepared = NULL; + read_extent_buffer(eb, prepared, off, len); out: return ret; @@ -450,6 +501,7 @@ static struct btrfs_path *alloc_path_for_send(void) return NULL; path->search_commit_root = 1; path->skip_locking = 1; + path->need_commit_sem = 1; return path; } @@ -728,29 +780,22 @@ out: /* * Helper function to retrieve some fields from an inode item. */ -static int get_inode_info(struct btrfs_root *root, - u64 ino, u64 *size, u64 *gen, - u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) +static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, + u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, + u64 *gid, u64 *rdev) { int ret; struct btrfs_inode_item *ii; struct btrfs_key key; - struct btrfs_path *path; - - path = alloc_path_for_send(); - if (!path) - return -ENOMEM; key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; if (ret) { - ret = -ENOENT; - goto out; + if (ret > 0) + ret = -ENOENT; + return ret; } ii = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -768,7 +813,22 @@ static int get_inode_info(struct btrfs_root *root, if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); -out: + return ret; +} + +static int get_inode_info(struct btrfs_root *root, + u64 ino, u64 *size, u64 *gen, + u64 *mode, u64 *uid, u64 *gid, + u64 *rdev) +{ + struct btrfs_path *path; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, + rdev); btrfs_free_path(path); return ret; } @@ -915,9 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - char *buf2 = NULL; - int buf_len; - int buf_virtual = 0; + const int buf_len = PATH_MAX; u32 name_len; u32 data_len; u32 cur; @@ -927,7 +985,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - buf_len = PAGE_SIZE; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -949,30 +1006,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); + /* + * Path too long + */ if (name_len + data_len > buf_len) { - buf_len = PAGE_ALIGN(name_len + data_len); - if (buf_virtual) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - vfree(buf); - } else { - buf2 = krealloc(buf, buf_len, GFP_NOFS); - if (!buf2) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - kfree(buf); - buf_virtual = 1; - } - } - - buf = buf2; - buf2 = NULL; + ret = -ENAMETOOLONG; + goto out; } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -995,10 +1034,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - if (buf_virtual) - vfree(buf); - else - kfree(buf); + kfree(buf); return ret; } @@ -1066,6 +1102,7 @@ out: struct backref_ctx { struct send_ctx *sctx; + struct btrfs_path *path; /* number of total found references */ u64 found; @@ -1136,8 +1173,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * There are inodes that have extents that lie behind its i_size. Don't * accept clones from these extents. */ - ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, - NULL); + ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL, + NULL, NULL, NULL); + btrfs_release_path(bctx->path); if (ret < 0) return ret; @@ -1216,12 +1254,17 @@ static int find_extent_clone(struct send_ctx *sctx, if (!tmp_path) return -ENOMEM; + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); if (!backref_ctx) { ret = -ENOMEM; goto out; } + backref_ctx->path = tmp_path; + if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1249,8 +1292,10 @@ static int find_extent_clone(struct send_ctx *sctx, } logical = disk_byte + btrfs_file_extent_offset(eb, fi); + down_read(&sctx->send_root->fs_info->commit_root_sem); ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, &found_key, &flags); + up_read(&sctx->send_root->fs_info->commit_root_sem); btrfs_release_path(tmp_path); if (ret < 0) @@ -1292,8 +1337,6 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = logical - found_key.objectid; else extent_item_pos = 0; - - extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, backref_ctx); @@ -1418,11 +1461,7 @@ static int gen_unique_name(struct send_ctx *sctx, while (1) { len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); - if (len >= sizeof(tmp)) { - /* should really not happen */ - ret = -EOVERFLOW; - goto out; - } + ASSERT(len < sizeof(tmp)); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, @@ -1632,7 +1671,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); @@ -1898,13 +1937,20 @@ static void name_cache_delete(struct send_ctx *sctx, nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)nce->ino); - BUG_ON(!nce_head); + if (!nce_head) { + btrfs_err(sctx->send_root->fs_info, + "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", + nce->ino, sctx->name_cache_size); + } list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; - if (list_empty(nce_head)) { + /* + * We may not get to the final release of nce_head if the lookup fails + */ + if (nce_head && list_empty(nce_head)) { radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } @@ -1977,7 +2023,6 @@ static void name_cache_free(struct send_ctx *sctx) */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, - int skip_name_cache, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) @@ -1987,8 +2032,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; - if (skip_name_cache) - goto get_ref; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes @@ -2033,12 +2076,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } -get_ref: /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. */ - if (ino < sctx->send_progress && !skip_name_cache) + if (ino < sctx->send_progress) ret = get_first_ref(sctx->send_root, ino, parent_ino, parent_gen, dest); else @@ -2062,8 +2104,6 @@ get_ref: goto out; ret = 1; } - if (skip_name_cache) - goto out; out_cache: /* @@ -2131,9 +2171,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; - u64 start_ino = ino; - u64 start_gen = gen; - int skip_name_cache = 0; name = fs_path_alloc(); if (!name) { @@ -2141,31 +2178,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, goto out; } - if (is_waiting_for_move(sctx, ino)) - skip_name_cache = 1; - -again: dest->reversed = 1; fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); - ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, - &parent_inode, &parent_gen, name); + if (is_waiting_for_rm(sctx, ino)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + + if (is_waiting_for_move(sctx, ino)) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + if (ret < 0) goto out; - if (ret) - stop = 1; - - if (!skip_name_cache && - is_waiting_for_move(sctx, parent_inode)) { - ino = start_ino; - gen = start_gen; - stop = 0; - skip_name_cache = 1; - goto again; - } ret = fs_path_add_path(dest, name); if (ret < 0) @@ -2429,10 +2468,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, - NULL, &rdev); - if (ret < 0) - goto out; + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, + NULL, NULL, &rdev); + if (ret < 0) + goto out; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; @@ -2512,17 +2557,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { - ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, - 1, 0); - if (ret < 0) - goto out; - if (!ret) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->send_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; } - if (ret || found_key.objectid != key.objectid || + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; goto out; @@ -2537,8 +2591,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) goto out; } - key.offset = found_key.offset + 1; - btrfs_release_path(path); + path->slots[0]++; } out: @@ -2590,7 +2643,7 @@ struct recorded_ref { * everything mixed. So we first record all refs and later process them. * This function is a helper to record one ref. */ -static int record_ref(struct list_head *head, u64 dir, +static int __record_ref(struct list_head *head, u64 dir, u64 dir_gen, struct fs_path *path) { struct recorded_ref *ref; @@ -2676,12 +2729,78 @@ out: return ret; } +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + odi = kmalloc(sizeof(*odi), GFP_NOFS); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) { + p = &(*p)->rb_left; + } else if (dir_ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(odi); + return entry; + } + } + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + /* * Returns 1 if a directory can be removed at this point in time. * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + u64 send_progress) { int ret = 0; struct btrfs_root *root = sctx->parent_root; @@ -2704,31 +2823,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (!ret) { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + struct waiting_dir_move *dm; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; } - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid || + found_key.type != key.type) break; - } di = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + struct orphan_dir_info *odi; + + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + dm->rmdir_ino = dir; + ret = 0; + goto out; + } + if (loc.objectid > send_progress) { ret = 0; goto out; } - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } ret = 1; @@ -2740,19 +2880,9 @@ out: static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) { - struct rb_node *n = sctx->waiting_dir_moves.rb_node; - struct waiting_dir_move *entry; + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); - while (n) { - entry = rb_entry(n, struct waiting_dir_move, node); - if (ino < entry->ino) - n = n->rb_left; - else if (ino > entry->ino) - n = n->rb_right; - else - return 1; - } - return 0; + return entry != NULL; } static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) @@ -2765,6 +2895,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) if (!dm) return -ENOMEM; dm->ino = ino; + dm->rmdir_ino = 0; while (*p) { parent = *p; @@ -2784,31 +2915,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return 0; } -static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) { struct rb_node *n = sctx->waiting_dir_moves.rb_node; struct waiting_dir_move *entry; while (n) { entry = rb_entry(n, struct waiting_dir_move, node); - if (ino < entry->ino) { + if (ino < entry->ino) n = n->rb_left; - } else if (ino > entry->ino) { + else if (ino > entry->ino) n = n->rb_right; - } else { - rb_erase(&entry->node, &sctx->waiting_dir_moves); - kfree(entry); - return 0; - } + else + return entry; } - return -ENOENT; + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); } -static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) +static int add_pending_dir_move(struct send_ctx *sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; - struct pending_dir_move *entry, *pm; + struct pending_dir_move *entry = NULL, *pm; struct recorded_ref *cur; int exists = 0; int ret; @@ -2817,8 +2958,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) if (!pm) return -ENOMEM; pm->parent_ino = parent_ino; - pm->ino = sctx->cur_ino; - pm->gen = sctx->cur_inode_gen; + pm->ino = ino; + pm->gen = ino_gen; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -2888,19 +3029,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; struct fs_path *to_path = NULL; + struct fs_path *name = NULL; u64 orig_progress = sctx->send_progress; struct recorded_ref *cur; + u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; int ret; + name = fs_path_alloc(); from_path = fs_path_alloc(); - if (!from_path) - return -ENOMEM; + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } - sctx->send_progress = pm->ino; - ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + free_waiting_dir_move(sctx, dm); + + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); if (ret < 0) goto out; + if (parent_ino == sctx->cur_ino) { + /* child only renamed, not moved */ + ASSERT(parent_gen == sctx->cur_inode_gen); + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + } else { + /* child moved and maybe renamed too */ + sctx->send_progress = pm->ino; + ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + if (ret < 0) + goto out; + } + + fs_path_free(name); + name = NULL; + to_path = fs_path_alloc(); if (!to_path) { ret = -ENOMEM; @@ -2908,9 +3082,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } sctx->send_progress = sctx->cur_ino + 1; - ret = del_waiting_dir_move(sctx, pm->ino); - ASSERT(ret == 0); - ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; @@ -2919,6 +3090,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (ret < 0) goto out; + if (rmdir_ino) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, rmdir_ino); + if (!odi) { + /* already deleted */ + goto finish; + } + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + free_orphan_dir_info(sctx, odi); + } + +finish: ret = send_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -2928,12 +3128,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) * and old parent(s). */ list_for_each_entry(cur, &pm->update_refs, list) { + if (cur->dir == rmdir_ino) + continue; ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } out: + fs_path_free(name); fs_path_free(from_path); fs_path_free(to_path); sctx->send_progress = orig_progress; @@ -3005,17 +3208,19 @@ static int wait_for_parent_move(struct send_ctx *sctx, int ret; u64 ino = parent_ref->dir; u64 parent_ino_before, parent_ino_after; - u64 new_gen, old_gen; + u64 old_gen; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; - - if (parent_ref->dir <= sctx->cur_ino) - return 0; + int register_upper_dirs; + u64 gen; if (is_waiting_for_move(sctx, ino)) return 1; + if (parent_ref->dir <= sctx->cur_ino) + return 0; + ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, NULL, NULL, NULL, NULL); if (ret == -ENOENT) @@ -3023,12 +3228,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, else if (ret < 0) return ret; - ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, - NULL, NULL, NULL, NULL); - if (ret < 0) - return ret; - - if (new_gen != old_gen) + if (parent_ref->dir_gen != old_gen) return 0; path_before = fs_path_alloc(); @@ -3051,7 +3251,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, } ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - NULL, path_after); + &gen, path_after); if (ret == -ENOENT) { ret = 0; goto out; @@ -3061,13 +3261,67 @@ static int wait_for_parent_move(struct send_ctx *sctx, len1 = fs_path_len(path_before); len2 = fs_path_len(path_after); - if ((parent_ino_before != parent_ino_after) && (len1 != len2 || - memcmp(path_before->start, path_after->start, len1))) { + if (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1)) { ret = 1; goto out; } ret = 0; + /* + * Ok, our new most direct ancestor has a higher inode number but + * wasn't moved/renamed. So maybe some of the new ancestors higher in + * the hierarchy have an higher inode number too *and* were renamed + * or moved - in this case we need to wait for the ancestor's rename + * or move operation before we can do the move/rename for the current + * inode. + */ + register_upper_dirs = 0; + ino = parent_ino_after; +again: + while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { + u64 parent_gen; + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + &parent_gen, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret == -ENOENT) { + ret = 0; + break; + } else if (ret < 0) { + goto out; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1)) { + ret = 1; + if (register_upper_dirs) { + break; + } else { + register_upper_dirs = 1; + ino = parent_ref->dir; + gen = parent_ref->dir_gen; + goto again; + } + } else if (register_upper_dirs) { + ret = add_pending_dir_move(sctx, ino, gen, + parent_ino_after); + if (ret < 0 && ret != -EEXIST) + goto out; + } + + ino = parent_ino_after; + gen = parent_gen; + } + out: fs_path_free(path_before); fs_path_free(path_after); @@ -3089,6 +3343,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_gen; int did_overwrite = 0; int is_orphan = 0; + u64 last_dir_ino_rm = 0; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -3227,9 +3482,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - if (wait_for_parent_move(sctx, cur)) { + ret = wait_for_parent_move(sctx, cur); + if (ret < 0) + goto out; + if (ret) { ret = add_pending_dir_move(sctx, - cur->dir); + sctx->cur_ino, + sctx->cur_inode_gen, + cur->dir); *pending_move = 1; } else { ret = send_rename(sctx, valid_path, @@ -3259,7 +3519,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -3350,8 +3611,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - } else if (ret == inode_state_did_delete) { - ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { + ret = can_rmdir(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -3362,6 +3625,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; + last_dir_ino_rm = cur->dir; } } } @@ -3375,9 +3639,8 @@ out: return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, + struct fs_path *name, void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -3388,7 +3651,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -3400,7 +3663,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (ret < 0) goto out; - ret = record_ref(&sctx->new_refs, dir, gen, p); + ret = __record_ref(refs, dir, gen, p); out: if (ret) @@ -3408,37 +3671,23 @@ out: return ret; } +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->send_root, num, dir, index, name, + ctx, &sctx->new_refs); +} + + static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { - int ret = 0; struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, dir, gen, p); - if (ret < 0) - goto out; - ret = fs_path_add_path(p, name); - if (ret < 0) - goto out; - - ret = record_ref(&sctx->deleted_refs, dir, gen, p); - -out: - if (ret) - fs_path_free(p); - return ret; + return record_ref(sctx->parent_root, num, dir, index, name, + ctx, &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) @@ -3619,21 +3868,31 @@ static int process_all_refs(struct send_ctx *sctx, root = sctx->parent_root; cb = __record_deleted_ref; } else { - BUG(); + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; } key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) - break; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -3642,11 +3901,10 @@ static int process_all_refs(struct send_ctx *sctx, break; ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); - btrfs_release_path(path); if (ret < 0) goto out; - key.offset = found_key.offset + 1; + path->slots[0]++; } btrfs_release_path(path); @@ -3927,19 +4185,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; @@ -3951,8 +4215,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: @@ -3991,6 +4254,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) goto out; last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + + /* initial readahead */ + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, inode->i_mapping); + btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + last_index - index + 1); + while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_CACHE_SIZE - pg_offset); @@ -4174,6 +4444,9 @@ static int send_hole(struct send_ctx *sctx, u64 end) p = fs_path_alloc(); if (!p) return -ENOMEM; + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto tlv_put_failure; memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); @@ -4181,9 +4454,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); @@ -4724,7 +4994,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (S_ISREG(sctx->cur_inode_mode)) { if (need_send_hole(sctx)) { - if (sctx->cur_inode_last_extent == (u64)-1) { + if (sctx->cur_inode_last_extent == (u64)-1 || + sctx->cur_inode_last_extent < + sctx->cur_inode_size) { ret = get_last_extent(sctx, (u64)-1); if (ret) goto out; @@ -4763,18 +5035,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) ret = apply_children_dir_moves(sctx); if (ret) goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; } - /* - * Need to send that every time, no matter if it actually - * changed between the two trees as we have done changes to - * the inode before. - */ - sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); - if (ret < 0) - goto out; - out: return ret; } @@ -4840,6 +5113,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { @@ -4884,6 +5159,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) goto out; @@ -5124,37 +5401,15 @@ static int full_send_tree(struct send_ctx *sctx) struct btrfs_path *path; struct extent_buffer *eb; int slot; - u64 start_ctransid; - u64 ctransid; path = alloc_path_for_send(); if (!path) return -ENOMEM; - spin_lock(&send_root->root_item_lock); - start_ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - /* - * Make sure the tree has not changed after re-joining. We detect this - * by comparing start_ctransid and ctransid. They should always match. - */ - spin_lock(&send_root->root_item_lock); - ctransid = btrfs_root_ctransid(&send_root->root_item); - spin_unlock(&send_root->root_item_lock); - - if (ctransid != start_ctransid) { - WARN(1, KERN_WARNING "BTRFS: the root that you're trying to " - "send was modified in between. This is " - "probably a bug.\n"); - ret = -EIO; - goto out; - } - ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -5340,6 +5595,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); @@ -5435,7 +5691,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; + current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); + current->journal_info = NULL; if (ret < 0) goto out; @@ -5477,6 +5735,16 @@ out: kfree(dm); } + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + if (sort_clone_roots) { for (i = 0; i < sctx->clone_roots_cnt; i++) btrfs_root_dec_send_in_progress( diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d04db817be5c..9601d25a4607 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,6 +66,8 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; +static int btrfs_remount(struct super_block *sb, int *flags, char *data); + static const char *btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -383,20 +385,6 @@ static match_table_t tokens = { {Opt_err, NULL}, }; -#define btrfs_set_and_info(root, opt, fmt, args...) \ -{ \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ -} - -#define btrfs_clear_and_info(root, opt, fmt, args...) \ -{ \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ -} - /* * Regular mount options parser. Everything that is needed only when * reading in a new superblock is parsed here. @@ -1184,7 +1172,31 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return ERR_PTR(-ENOMEM); mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + + if (PTR_RET(mnt) == -EBUSY) { + if (flags & MS_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, + newargs); + } else { + int r; + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, + newargs); + if (IS_ERR(mnt)) { + kfree(newargs); + return ERR_CAST(mnt); + } + + r = btrfs_remount(mnt->mnt_sb, &flags, NULL); + if (r < 0) { + /* FIXME: release vfsmount mnt ??*/ + kfree(newargs); + return ERR_PTR(r); + } + } + } + kfree(newargs); + if (IS_ERR(mnt)) return ERR_CAST(mnt); @@ -1305,13 +1317,6 @@ error_fs_info: return ERR_PTR(error); } -static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) -{ - spin_lock_irq(&workers->lock); - workers->max_workers = new_limit; - spin_unlock_irq(&workers->lock); -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, int new_pool_size, int old_pool_size) { @@ -1323,21 +1328,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); - btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->workers, new_pool_size); - btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, - new_pool_size); + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, + new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, + new_pool_size); } static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) @@ -1388,6 +1392,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + sync_filesystem(sb); btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); @@ -1479,6 +1484,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } out: + wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); return 0; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 865f4cf9a769..c5eb2143dc66 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -24,6 +24,7 @@ #include <linux/kobject.h> #include <linux/bug.h> #include <linux/genhd.h> +#include <linux/debugfs.h> #include "ctree.h" #include "disk-io.h" @@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; +/* /sys/kernel/debug/btrfs */ +static struct dentry *btrfs_debugfs_root_dentry; + +/* Debugging tunables and exported data */ +u64 btrfs_debugfs_test; + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; @@ -642,27 +649,41 @@ failure: return error; } +static int btrfs_init_debugfs(void) +{ +#ifdef CONFIG_DEBUG_FS + btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); + if (!btrfs_debugfs_root_dentry) + return -ENOMEM; + + debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry, + &btrfs_debugfs_test); +#endif + return 0; +} + int btrfs_init_sysfs(void) { int ret; + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; - init_feature_attrs(); + ret = btrfs_init_debugfs(); + if (ret) + return ret; + init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); - if (ret) { - kset_unregister(btrfs_kset); - return ret; - } - return 0; + return ret; } void btrfs_exit_sysfs(void) { sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); + debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index f3cea3710d44..9ab576318a84 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -1,6 +1,11 @@ #ifndef _BTRFS_SYSFS_H_ #define _BTRFS_SYSFS_H_ +/* + * Data exported through sysfs + */ +extern u64 btrfs_debugfs_test; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 34cd83184c4a..7579f6d0b854 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -75,10 +75,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } -static noinline void switch_commit_root(struct btrfs_root *root) +static noinline void switch_commit_roots(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + struct btrfs_root *root, *tmp; + + down_write(&fs_info->commit_root_sem); + list_for_each_entry_safe(root, tmp, &trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + if (is_fstree(root->objectid)) + btrfs_unpin_free_ino(root); + } + up_write(&fs_info->commit_root_sem); } static inline void extwriter_counter_inc(struct btrfs_transaction *trans, @@ -208,6 +219,7 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); + INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -375,7 +387,8 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); - if (current->journal_info) { + if (current->journal_info && + current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; h->use_count++; @@ -683,7 +696,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - if (--trans->use_count) { + if (trans->use_count > 1) { + trans->use_count--; trans->block_rsv = trans->orig_rsv; return 0; } @@ -731,17 +745,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { - if (throttle) { - /* - * We may race with somebody else here so end up having - * to call end_transaction on ourselves again, so inc - * our use_count. - */ - trans->use_count++; + if (throttle) return btrfs_commit_transaction(trans, root); - } else { + else wake_up_process(info->transaction_kthread); - } } if (trans->type & __TRANS_FREEZABLE) @@ -925,9 +932,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; } - if (root != root->fs_info->extent_root) - switch_commit_root(root); - return 0; } @@ -983,15 +987,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + if (root != fs_info->extent_root) + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; } - down_write(&fs_info->extent_commit_sem); - switch_commit_root(fs_info->extent_root); - up_write(&fs_info->extent_commit_sem); - + list_add_tail(&fs_info->extent_root->dirty_list, + &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); return 0; @@ -1048,11 +1053,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, smp_wmb(); if (root->commit_root != root->node) { - mutex_lock(&root->fs_commit_mutex); - switch_commit_root(root); - btrfs_unpin_free_ino(root); - mutex_unlock(&root->fs_commit_mutex); - + list_add_tail(&root->dirty_list, + &trans->transaction->switch_commits); btrfs_set_root_node(&root->root_item, root->node); } @@ -1578,10 +1580,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, trace_btrfs_transaction_commit(root); - btrfs_scrub_continue(root); - if (current->journal_info == trans) current->journal_info = NULL; + btrfs_scrub_cancel(root->fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } @@ -1621,7 +1622,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - return btrfs_start_delalloc_roots(fs_info, 1); + return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } @@ -1754,7 +1755,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, /* ->aborted might be set after the previous check, so check it */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + goto scrub_continue; } /* * the reloc mutex makes sure that we stop @@ -1771,7 +1772,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = create_pending_snapshots(trans, root->fs_info); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1787,13 +1788,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_items(trans, root); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1823,7 +1824,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1844,7 +1845,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1855,7 +1856,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = cur_trans->aborted; mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } btrfs_prepare_extent_commit(trans, root); @@ -1864,11 +1865,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - switch_commit_root(root->fs_info->tree_root); + list_add_tail(&root->fs_info->tree_root->dirty_list, + &cur_trans->switch_commits); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - switch_commit_root(root->fs_info->chunk_root); + list_add_tail(&root->fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); + + switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); update_super_roots(root); @@ -1891,13 +1896,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = write_ctree_super(trans, root, 0); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1940,6 +1945,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; +scrub_continue: + btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6ac037e9f9f0..b57b924e8e03 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,6 +57,7 @@ struct btrfs_transaction { struct list_head pending_snapshots; struct list_head ordered_operations; struct list_head pending_chunks; + struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -78,6 +79,8 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ __TRANS_ATTACH) +#define BTRFS_SEND_TRANS_STUB 1 + struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39d83da03e03..e2f45fc02610 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * syncing the tree wait for us to finish */ static int start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) { + int index; int ret; - int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { + ret = -EAGAIN; + goto out; + } + if (!root->log_start_pid) { root->log_start_pid = current->pid; root->log_multiple_pids = false; @@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } mutex_unlock(&root->log_mutex); return 0; } - root->log_multiple_pids = false; - root->log_start_pid = current->pid; + + ret = 0; mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) { + if (!root->fs_info->log_root_tree) ret = btrfs_init_log_root_tree(trans, root->fs_info); - if (ret) - err = ret; - } - if (err == 0 && !root->log_root) { + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; + + if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) - err = ret; + goto out; } - mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_multiple_pids = false; + root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; + } +out: mutex_unlock(&root->log_mutex); - return err; + return ret; } /* @@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) +static void wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && + } while (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])); - return 0; } static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) { + + while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } } +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + if (!ctx) + return; + + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no other task which + * can access the list. + */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + + if (!error) { + INIT_LIST_HEAD(&root->log_ctxs[index]); + return; + } + + list_for_each_entry(ctx, &root->log_ctxs[index], list) + ctx->log_ret = error; + + INIT_LIST_HEAD(&root->log_ctxs[index]); +} + /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, @@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, struct btrfs_log_ctx *ctx) { int index1; int index2; @@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - unsigned long log_transid = 0; + int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; mutex_lock(&root->log_mutex); - log_transid = root->log_transid; - index1 = root->log_transid % 2; + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, log_transid); mutex_unlock(&root->log_mutex); - return 0; + return ctx->log_ret; } + ASSERT(log_transid == root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(trans, root, log_transid - 1); + while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ @@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; mutex_unlock(&root->log_mutex); goto out; } @@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; - smp_mb(); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to @@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); + btrfs_init_log_ctx(&root_log_ctx); + mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); @@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + blk_finish_plug(&plug); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } - root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = log_root_tree->log_transid % 2; + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + + index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + root_log_ctx.log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = 0; + ret = root_log_ctx.log_ret; goto out; } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); + root_log_ctx.log_transid - 1); } wait_for_writer(trans, log_root_tree); @@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); @@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_header_level(log_root_tree->node)); log_root_tree->log_transid++; - smp_mb(); - mutex_unlock(&log_root_tree->log_mutex); /* @@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } @@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: + /* + * We needn't get log_mutex here because we are sure all + * the other tasks are blocked. + */ + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); - smp_mb(); + mutex_unlock(&log_root_tree->log_mutex); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: + /* See above. */ + btrfs_remove_all_log_ctxs(root, index1, ret); + + mutex_lock(&root->log_mutex); + root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); - smp_mb(); + mutex_unlock(&root->log_mutex); + if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path) + struct extent_map *em, struct btrfs_path *path, + struct list_head *logged_list) { struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; @@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; int extent_inserted = 0; @@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * First check and see if our csums are on our outstanding ordered * extents. */ -again: - spin_lock_irq(&log->log_extents_lock[index]); - list_for_each_entry(ordered, &log->logged_list[index], log_list) { + list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; if (!mod_len) break; - if (ordered->inode != inode) - continue; - if (ordered->file_offset + ordered->len <= mod_start || mod_start + mod_len <= ordered->file_offset) continue; @@ -3632,12 +3720,6 @@ again: if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) continue; - atomic_inc(&ordered->refs); - spin_unlock_irq(&log->log_extents_lock[index]); - /* - * we've dropped the lock, we must either break or - * start over after this. - */ if (ordered->csum_bytes_left) { btrfs_start_ordered_extent(inode, ordered, 0); @@ -3647,16 +3729,11 @@ again: list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) { - btrfs_put_ordered_extent(ordered); + if (ret) goto unlocked; - } } - btrfs_put_ordered_extent(ordered); - goto again; } - spin_unlock_irq(&log->log_extents_lock[index]); unlocked: if (!mod_len || ret) @@ -3694,7 +3771,8 @@ unlocked: static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct list_head *logged_list) { struct extent_map *em, *n; struct list_head extents; @@ -3752,7 +3830,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path); + ret = log_one_extent(trans, inode, root, em, path, logged_list); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; int ret; @@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(log, inode); + btrfs_get_logged_extents(inode, &logged_list); /* * a brute force approach to making sure we get the most uptodate @@ -3962,7 +4041,8 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path, + &logged_list); if (ret) { err = ret; goto out_unlock; @@ -3987,8 +4067,10 @@ log_extents: BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: - if (err) - btrfs_free_logged_extents(log, log->log_transid); + if (unlikely(err)) + btrfs_put_logged_extents(&logged_list); + else + btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); @@ -4079,7 +4161,8 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) + struct dentry *parent, int exists_only, + struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; @@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = start_log_trans(trans, root); + ret = start_log_trans(trans, root, ctx); if (ret) - goto end_trans; + goto end_no_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); if (ret) @@ -4166,6 +4249,9 @@ end_trans: root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } + + if (ret) + btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); end_no_trans: return ret; @@ -4178,12 +4264,14 @@ end_no_trans: * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + 0, ctx); dput(parent); return ret; @@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1); + return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1d4ae0d15a70..91b145fce333 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -22,14 +22,28 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +struct btrfs_log_ctx { + int log_ret; + int log_transid; + struct list_head list; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +{ + ctx->log_ret = 0; + ctx->log_transid = 0; + INIT_LIST_HEAD(&ctx->list); +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bab0b84d8f80..49d7fab73360 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -415,7 +415,8 @@ loop_lock: device->running_pending = 1; spin_unlock(&device->io_lock); - btrfs_requeue_work(&device->work); + btrfs_queue_work(fs_info->submit_workers, + &device->work); goto done; } /* unplug every 64 requests just for good measure */ @@ -447,6 +448,14 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } +/* + * Add new device to list of registered devices + * + * Returns: + * 1 - first time device is seen + * 0 - device already known + * < 0 - error + */ static noinline int device_list_add(const char *path, struct btrfs_super_block *disk_super, u64 devid, struct btrfs_fs_devices **fs_devices_ret) @@ -454,6 +463,7 @@ static noinline int device_list_add(const char *path, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; struct rcu_string *name; + int ret = 0; u64 found_transid = btrfs_super_generation(disk_super); fs_devices = find_fsid(disk_super->fsid); @@ -494,6 +504,7 @@ static noinline int device_list_add(const char *path, fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); + ret = 1; device->fs_devices = fs_devices; } else if (!device->name || strcmp(device->name->str, path)) { name = rcu_string_strdup(path, GFP_NOFS); @@ -512,7 +523,8 @@ static noinline int device_list_add(const char *path, fs_devices->latest_trans = found_transid; } *fs_devices_ret = fs_devices; - return 0; + + return ret; } static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) @@ -909,17 +921,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); - if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; - printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); - } else { - printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); - } - - printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); - ret = device_list_add(path, disk_super, devid, fs_devices_ret); + if (ret > 0) { + if (disk_super->label[0]) { + if (disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; + printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); + } else { + printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); + } + + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); + ret = 0; + } if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; @@ -5263,6 +5277,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5270,7 +5285,6 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; - struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5292,6 +5306,8 @@ static void btrfs_end_bio(struct bio *bio, int err) if (bio == bbio->orig_bio) is_orig_bio = 1; + btrfs_bio_counter_dec(bbio->fs_info); + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); @@ -5328,13 +5344,6 @@ static void btrfs_end_bio(struct bio *bio, int err) } } -struct async_sched { - struct bio *bio; - int rw; - struct btrfs_fs_info *info; - struct btrfs_work work; -}; - /* * see run_scheduled_bios for a description of why bios are collected for * async submit. @@ -5391,8 +5400,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, spin_unlock(&device->io_lock); if (should_queue) - btrfs_queue_worker(&root->fs_info->submit_workers, - &device->work); + btrfs_queue_work(root->fs_info->submit_workers, + &device->work); } static int bio_size_ok(struct block_device *bdev, struct bio *bio, @@ -5447,6 +5456,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, } #endif bio->bi_bdev = dev->bdev; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + if (async) btrfs_schedule_bio(root, dev, rw, bio); else @@ -5515,28 +5527,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, length = bio->bi_iter.bi_size; map_length = length; + btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num, &raid_map); - if (ret) /* -ENOMEM */ + if (ret) { + btrfs_bio_counter_dec(root->fs_info); return ret; + } total_devs = bbio->num_stripes; bbio->orig_bio = first_bio; bbio->private = first_bio->bi_private; bbio->end_io = first_bio->bi_end_io; + bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); if (raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - return raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, + raid_map, map_length); } else { - return raid56_parity_recover(root, bio, bbio, - raid_map, map_length, - mirror_num); + ret = raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); } + /* + * FIXME, replace dosen't support raid56 yet, please fix + * it in the future. + */ + btrfs_bio_counter_dec(root->fs_info); + return ret; } if (map_length < length) { @@ -5578,6 +5600,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, async_submit); dev_nr++; } + btrfs_bio_counter_dec(root->fs_info); return 0; } @@ -5666,7 +5689,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - dev->work.func = pending_bios_fn; + btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); return dev; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8b3cd142b373..80754f9dd3df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { atomic_t stripes_pending; + struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; struct bio *orig_bio; void *private; diff --git a/fs/buffer.c b/fs/buffer.c index 27265a8b43c1..9ddb9fc7d923 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end); * Returns true if all buffers which correspond to a file portion * we want to read are uptodate. */ -int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, - unsigned long from) +int block_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count) { unsigned block_start, block_end, blocksize; unsigned to; @@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, head = page_buffers(page); blocksize = head->b_size; - to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; @@ -3088,7 +3088,7 @@ EXPORT_SYMBOL(submit_bh); * until the buffer gets unlocked). * * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * the buffer up-to-date (if appropriate), unlocks the buffer and wakes * any waiters. * * All of the buffers must be for the same device, and must also be a diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 622f4696e484..5b99bafc31d1 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; if (!root->d_inode || - !root->d_inode->i_op || !root->d_inode->i_op->lookup || !root->d_inode->i_op->mkdir || !root->d_inode->i_op->setxattr || diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ca65f39dc8dc..c0a681705104 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -391,12 +391,12 @@ try_again: path.dentry = dir; path_to_graveyard.mnt = cache->mnt; path_to_graveyard.dentry = cache->graveyard; - ret = security_path_rename(&path, rep, &path_to_graveyard, grave); + ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0); if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave, NULL); + cache->graveyard->d_inode, grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); @@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op || - !subdir->d_inode->i_op->setxattr || + if (!subdir->d_inode->i_op->setxattr || !subdir->d_inode->i_op->getxattr || !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index ebaff368120d..4b1fb5ca65b8 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto nomem_monitor; } - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) goto nomem_page; } - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ + /* we've installed a new backing page, so now we need to start + * it reading */ installed_new_backing_page: _debug("- new %p", newpage); backpage = newpage; newpage = NULL; - lru_cache_add_file(backpage); - read_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) @@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - ret = add_to_page_cache(newpage, bmapping, - netpage->index, cachefiles_gfp); + ret = add_to_page_cache_lru(newpage, bmapping, + netpage->index, + cachefiles_gfp); if (ret == 0) goto installed_new_backing_page; if (ret != -EEXIST) goto nomem; } - /* we've installed a new backing page, so now we need to add it - * to the LRU list and start it reading */ + /* we've installed a new backing page, so now we need + * to start it reading */ installed_new_backing_page: _debug("- new %p", newpage); backpage = newpage; newpage = NULL; - lru_cache_add_file(backpage); - reread_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) @@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, monitor_backing_page: _debug("- monitor add"); - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); @@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - lru_cache_add_file(netpage); - /* install a monitor */ page_cache_get(netpage); monitor->netfs_page = netpage; @@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, backing_page_already_uptodate: _debug("- uptodate"); - ret = add_to_page_cache(netpage, op->mapping, netpage->index, - cachefiles_gfp); + ret = add_to_page_cache_lru(netpage, op->mapping, + netpage->index, cachefiles_gfp); if (ret < 0) { if (ret == -EEXIST) { page_cache_release(netpage); @@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, fscache_mark_page_cached(op, netpage); - lru_cache_add_file(netpage); - /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); page_cache_release(netpage); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 8c44fdd4e1c3..834f9f3723fb 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, ci, true); + fscache_check_consistency(ci->fscache); done: mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index da95f61b7a09..5ac591bd012b 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); void ceph_queue_revalidate(struct inode *inode); +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_attr_changed(ci->fscache); +} + static inline void ceph_fscache_invalidate(struct inode *inode) { fscache_invalidate(ceph_inode(inode)->fscache); @@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + static inline void ceph_fscache_invalidate(struct inode *inode) { } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 17543383545c..c561b628ebce 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -622,8 +622,10 @@ retry: if (flags & CEPH_CAP_FLAG_AUTH) { if (ci->i_auth_cap == NULL || - ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { ci->i_auth_cap = cap; + cap->mds_wanted = wanted; + } ci->i_cap_exporting_issued = 0; } else { WARN_ON(ci->i_auth_cap == cap); @@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - mds_wanted |= cap->mds_wanted; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } @@ -3256,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 6d59006bfa27..16b54aa31f08 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); } if (req->r_old_dentry) { @@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry_dir), + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 45eda6d7a40c..c29d6ae68874 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -119,7 +119,8 @@ static int fpos_cmp(loff_t l, loff_t r) * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ -static int __dcache_readdir(struct file *file, struct dir_context *ctx) +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + u32 shared_gen) { struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_dentry; @@ -133,14 +134,14 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx) last = fi->dentry; fi->dentry = NULL; - dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos, - last); + dout("__dcache_readdir %p v%u at %llu (last %p)\n", + dir, shared_gen, ctx->pos, last); spin_lock(&parent->d_lock); /* start at beginning? */ if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -161,7 +162,8 @@ more: goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!d_unhashed(dentry) && dentry->d_inode && + if (di->lease_shared_gen == shared_gen && + !d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) @@ -180,9 +182,16 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -190,25 +199,18 @@ more: if (last) { /* remember our position */ fi->dentry = last; - fi->next_offset = di->offset; + fi->next_offset = fpos_off(di->offset); } dput(dentry); return 0; } + ctx->pos = di->offset + 1; + if (last) dput(last); last = dentry; - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } - spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; @@ -252,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = fsc->mount_options->max_readdir; - const int max_bytes = fsc->mount_options->max_readdir_bytes; dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); if (fi->flags & CEPH_F_ATEND) @@ -291,10 +291,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + u32 shared_gen = ci->i_shared_gen; spin_unlock(&ci->i_ceph_lock); - err = __dcache_readdir(file, ctx); + err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -322,14 +325,16 @@ more: fi->last_readdir = NULL; } - /* requery frag tree, as the frag topology may have changed */ - frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); - dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } req->r_inode = inode; ihold(inode); req->r_dentry = dget(file->f_dentry); @@ -340,9 +345,6 @@ more: req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); - req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); - req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -369,9 +371,9 @@ more: fi->next_offset = 0; off = fi->next_offset; } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; - fi->frag = frag; if (req->r_reply_info.dir_end) { kfree(fi->last_name); @@ -446,7 +448,6 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); @@ -454,7 +455,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_file_info *fi, unsigned frag) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -462,7 +463,10 @@ static void reset_readdir(struct ceph_file_info *fi) } kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; /* compensate for . and .. */ + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; /* compensate for . and .. */ + else + fi->next_offset = 0; if (fi->dentry) { dput(fi->dentry); fi->dentry = NULL; @@ -474,7 +478,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = offset; + loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; mutex_lock(&inode->i_mutex); @@ -491,7 +495,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) goto out; } - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset >= 0) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -504,14 +508,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) * seek to new frag, or seek prior to current chunk. */ if (offset == 0 || - fpos_frag(offset) != fpos_frag(old_offset) || + fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); + reset_readdir(fi, fpos_frag(offset)); } /* bump dir_release_count if we did a forward seek */ - if (offset > old_offset) + if (fpos_cmp(offset, old_offset) > 0) fi->dir_release_count--; } out: @@ -812,8 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry = dget(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -911,10 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + ihold(old_dir); req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry_dir = old_dir; req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -932,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 16796be53ca5..00d6af6a32ec 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -8,23 +8,6 @@ #include "mds_client.h" /* - * NFS export support - * - * NFS re-export of a ceph mount is, at present, only semireliable. - * The basic issue is that the Ceph architectures doesn't lend itself - * well to generating filehandles that will remain valid forever. - * - * So, we do our best. If you're lucky, your inode will be in the - * client's cache. If it's not, and you have a connectable fh, then - * the MDS server may be able to find it for you. Otherwise, you get - * ESTALE. - * - * There are ways to this more reliable, but in the non-connectable fh - * case, we won't every work perfectly, and in the connectable case, - * some changes are needed on the MDS side to work better. - */ - -/* * Basic fh */ struct ceph_nfs_fh { @@ -32,22 +15,12 @@ struct ceph_nfs_fh { } __attribute__ ((packed)); /* - * Larger 'connectable' fh that includes parent ino and name hash. - * Use this whenever possible, as it works more reliably. + * Larger fh that includes parent ino. */ struct ceph_nfs_confh { u64 ino, parent_ino; - u32 parent_name_hash; } __attribute__ ((packed)); -/* - * The presence of @parent_inode here tells us whether NFS wants a - * connectable file handle. However, we want to make a connectionable - * file handle unconditionally so that the MDS gets as much of a hint - * as possible. That means we only use @parent_dentry to indicate - * whether nfsd wants a connectable fh, and whether we should indicate - * failure from a too-small @max_len. - */ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { @@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct ceph_nfs_confh *cfh = (void *)rawfh; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; - struct dentry *dentry; - struct dentry *parent; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - dentry = d_find_alias(inode); + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } - /* if we found an alias, generate a connectable fh */ - if (*max_len >= connected_handle_length && dentry) { - dout("encode_fh %p connectable\n", dentry); - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; + if (parent_inode) { + dout("encode_fh %llx with parent %llx\n", + ceph_ino(inode), ceph_ino(parent_inode)); cfh->ino = ceph_ino(inode); - cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, - dentry); + cfh->parent_ino = ceph_ino(parent_inode); *max_len = connected_handle_length; - type = 2; - spin_unlock(&dentry->d_lock); - } else if (*max_len >= handle_length) { - if (parent_inode) { - /* nfsd wants connectable */ - *max_len = connected_handle_length; - type = FILEID_INVALID; - } else { - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(inode); - *max_len = handle_length; - type = 1; - } + type = FILEID_INO32_GEN_PARENT; } else { + dout("encode_fh %llx\n", ceph_ino(inode)); + fh->ino = ceph_ino(inode); *max_len = handle_length; - type = FILEID_INVALID; + type = FILEID_INO32_GEN; } - if (dentry) - dput(dentry); return type; } -/* - * convert regular fh to dentry - * - * FIXME: we should try harder by querying the mds for the ino. - */ -static struct dentry *__fh_to_dentry(struct super_block *sb, - struct ceph_nfs_fh *fh, int fh_len) +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; @@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, struct ceph_vino vino; int err; - if (fh_len < sizeof(*fh) / 4) - return ERR_PTR(-ESTALE); - - dout("__fh_to_dentry %llx\n", fh->ino); - vino.ino = fh->ino; + vino.ino = ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { @@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", - fh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); return dentry; } /* - * convert connectable fh to dentry + * convert regular fh to dentry */ -static struct dentry *__cfh_to_dentry(struct super_block *sb, - struct ceph_nfs_confh *cfh, int fh_len) +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / 4) + return NULL; + + dout("fh_to_dentry %llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; struct inode *inode; struct dentry *dentry; - struct ceph_vino vino; int err; - if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); - - dout("__cfh_to_dentry %llx (%llx/%x)\n", - cfh->ino, cfh->parent_ino, cfh->parent_name_hash); - - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) { - struct ceph_mds_request *req; - - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, - USE_ANY_MDS); - if (IS_ERR(req)) - return ERR_CAST(req); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); - req->r_ino1 = vino; - req->r_ino2.ino = cfh->parent_ino; - req->r_ino2.snap = CEPH_NOSNAP; - req->r_path2 = kmalloc(16, GFP_NOFS); - snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); - req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, NULL, req); - inode = req->r_target_inode; - if (inode) - ihold(inode); - ceph_mdsc_put_request(req); - if (!inode) - return ERR_PTR(err ? err : -ESTALE); + if (child) { + req->r_inode = child->d_inode; + ihold(child->d_inode); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; } + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + dout("__get_parent ino %llx parent %p ino %llx.%llx\n", + child ? ceph_ino(child->d_inode) : ino, + dentry, ceph_vinop(inode)); return dentry; } -static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +struct dentry *ceph_get_parent(struct dentry *child) { - if (fh_type == 1) - return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, - fh_len); - else - return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, - fh_len); + /* don't re-export snaps */ + if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + return ERR_PTR(-EINVAL); + + dout("get_parent %p ino %llx.%llx\n", + child, ceph_vinop(child->d_inode)); + return __get_parent(child->d_sb, child, 0); } /* - * get parent, if possible. - * - * FIXME: we could do better by querying the mds to discover the - * parent. + * convert regular fh to parent */ static struct dentry *ceph_fh_to_parent(struct super_block *sb, - struct fid *fid, + struct fid *fid, int fh_len, int fh_type) { struct ceph_nfs_confh *cfh = (void *)fid->raw; - struct ceph_vino vino; - struct inode *inode; struct dentry *dentry; - int err; - if (fh_type == 1) - return ERR_PTR(-ESTALE); + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; if (fh_len < sizeof(*cfh) / 4) - return ERR_PTR(-ESTALE); + return NULL; - pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, - cfh->parent_name_hash); + dout("fh_to_parent %llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + int err; - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); - iput(inode); - return dentry; - } - err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); - return ERR_PTR(err); + mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + mutex_lock(&parent->d_inode->i_mutex); + + req->r_inode = child->d_inode; + ihold(child->d_inode); + req->r_ino2 = ceph_vino(parent->d_inode); + req->r_locked_dir = parent->d_inode; + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + mutex_unlock(&parent->d_inode->i_mutex); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(child->d_inode), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(child->d_inode), err); } - dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); - return dentry; + + ceph_mdsc_put_request(req); + return err; } const struct export_operations ceph_export_ops = { .encode_fh = ceph_encode_fh, .fh_to_dentry = ceph_fh_to_dentry, .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 09c7afe32e49..88a6df4cbe6d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file) ihold(inode); req->r_num_caps = 1; - if (flags & (O_CREAT|O_TRUNC)) + if (flags & O_CREAT) parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); @@ -291,8 +291,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } err = finish_open(file, dentry, ceph_open, opened); } - out_err: + if (!req->r_err && req->r_target_inode) + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); dout("atomic_open result=%d\n", err); return err; @@ -600,7 +601,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } num_pages = calc_pages_for(page_align, len); @@ -718,7 +719,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } /* @@ -970,6 +971,8 @@ retry_snap: goto retry_snap; } } else { + loff_t old_size = inode->i_size; + struct iov_iter from; /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -977,9 +980,12 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, &iocb->ki_pos, - count, 0); + iov_iter_init(&from, iov, nr_segs, count, 0); + written = generic_perform_write(file, &from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + if (inode->i_size > old_size) + ceph_fscache_update_objectsize(inode); mutex_unlock(&inode->i_mutex); } @@ -1215,9 +1221,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 32d519d8a2e2..233c6f96910a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -659,14 +659,6 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - /* only update max_size on auth cap */ - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && - ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); - ci->i_max_size = le64_to_cpu(info->max_size); - } - ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -752,9 +744,16 @@ static int fill_inode(struct inode *inode, !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; } no_change: + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + spin_unlock(&ci->i_ceph_lock); /* queue truncate if we saw i_size decrease */ @@ -890,41 +889,6 @@ out_unlock: } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; @@ -1044,10 +1005,59 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, session, req->r_request_started, -1, &req->r_caps_reservation); if (err < 0) - return err; + goto done; } else { WARN_ON_ONCE(1); } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(dname.name, dname.len); + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + dput(parent); + goto done; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } } if (rinfo->head->is_target) { @@ -1063,7 +1073,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, err = fill_inode(in, &rinfo->targeti, NULL, session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? + (!req->r_aborted && rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { @@ -1112,6 +1122,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1131,13 +1144,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1164,8 +1174,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* attach proper inode */ if (!dn->d_inode) { + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease, true); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1186,17 +1197,16 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, NULL, true); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1358,7 +1368,7 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL, false); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); dn = NULL; @@ -1616,8 +1626,6 @@ static const struct inode_operations ceph_symlink_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, - .get_acl = ceph_get_acl, - .set_acl = ceph_set_acl, }; /* @@ -1627,7 +1635,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1819,9 +1826,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index dc66c9e023e4..a822a6e58290 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,9 +1,8 @@ +#include <linux/ceph/ceph_debug.h> #include <linux/in.h> #include "super.h" #include "mds_client.h" -#include <linux/ceph/ceph_debug.h> - #include "ioctl.h" @@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc, static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; @@ -111,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -121,9 +121,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } @@ -157,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ae6d14e82b0f..191398852a2e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -2,11 +2,31 @@ #include <linux/file.h> #include <linux/namei.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" #include <linux/ceph/pagelist.h> +static u64 lock_secret; + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + /** * Implement fcntl and flock locking functions. */ @@ -14,17 +34,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, int cmd, u8 wait, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; + u64 owner; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) @@ -32,25 +53,27 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, else length = fl->fl_end - fl->fl_start + 1; - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type); + if (lock_type == CEPH_LOCK_FCNTL) + owner = secure_addr(fl->fl_owner); + else + owner = secure_addr(fl->fl_file); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); - /* This should be adjusted, but I'm not sure if - namespaces actually get id numbers*/ - req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)(unsigned long)fl->fl_nspid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); - if ( operation == CEPH_MDS_OP_GETFILELOCK){ + if (operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; @@ -87,14 +110,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_lock, fl_pid:%d", fl->fl_pid); + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_lock, fl_owner: %p", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ - if (F_SETLKW == cmd) - wait = 1; - if (F_GETLK == cmd) + if (IS_GETLK(cmd)) op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; @@ -105,7 +133,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - if ( op != CEPH_MDS_OP_GETFILELOCK ){ + if (op != CEPH_MDS_OP_GETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { @@ -131,20 +159,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { u8 lock_cmd; int err; - u8 wait = 1; - - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_flock, fl_pid:%d", fl->fl_pid); - - /* set wait bit, then clear it out of cmd*/ - if (cmd & LOCK_NB) - wait = 0; - cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); - /* set command sequence that Ceph wants to see: - shared lock, exclusive lock, or unlock */ - if (LOCK_SH == cmd) + u8 wait = 0; + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_flock, fl_file: %p", fl->fl_file); + + if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; - else if (LOCK_EX == cmd) + else if (F_WRLCK == fl->fl_type) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; @@ -280,13 +310,14 @@ int lock_to_ceph_filelock(struct file_lock *lock, struct ceph_filelock *cephlock) { int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = - cpu_to_le64((u64)(unsigned long)lock->fl_nspid); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + if (lock->fl_flags & FL_POSIX) + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + else + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file)); switch (lock->fl_type) { case F_RDLCK: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f4f050a69a48..2b4d093d0563 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3,6 +3,7 @@ #include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -165,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end, if (num == 0) goto done; - /* alloc large array */ - info->dir_nr = num; - info->dir_in = kcalloc(num, sizeof(*info->dir_in) + - sizeof(*info->dir_dname) + - sizeof(*info->dir_dname_len) + - sizeof(*info->dir_dlease), - GFP_NOFS); - if (info->dir_in == NULL) { - err = -ENOMEM; - goto out_bad; - } + BUG_ON(!info->dir_in); info->dir_dname = (void *)(info->dir_in + num); info->dir_dname_len = (void *)(info->dir_dname + num); info->dir_dlease = (void *)(info->dir_dname_len + num); + if ((unsigned long)(info->dir_dlease + num) > + (unsigned long)info->dir_in + info->dir_buf_size) { + pr_err("dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; + } + info->dir_nr = num; while (num) { /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); @@ -327,7 +325,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - kfree(info->dir_in); + if (!info->dir_in) + return; + free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); } @@ -512,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { + if (req->r_reply) ceph_msg_put(req->r_reply); - destroy_reply_info(&req->r_reply_info); - } if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); @@ -528,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); - if (req->r_old_dentry) { + if (req->r_old_dentry) + dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { /* * track (and drop pins for) r_old_dentry_dir * separately, since r_old_dentry's d_parent may have @@ -537,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref) */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - dput(req->r_old_dentry); iput(req->r_old_dentry_dir); } kfree(req->r_path1); @@ -1311,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc, trim_caps - session->s_trim_caps); session->s_trim_caps = 0; } + + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); return 0; } @@ -1461,15 +1464,18 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, dout("discard_cap_releases mds%d\n", session->s_mds); - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; + if (!list_empty(&session->s_cap_releases)) { + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", + session->s_mds, msg, num); + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + session->s_num_cap_releases += num; + } /* requeue completed messages */ while (!list_empty(&session->s_cap_releases_done)) { @@ -1492,6 +1498,43 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, * requests */ +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + + sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + int order, num_entries; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1); + num_entries = min(num_entries, opt->max_readdir); + + order = get_order(size * num_entries); + while (order >= 0) { + rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + order); + if (rinfo->dir_in) + break; + order--; + } + if (!rinfo->dir_in) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + /* * Create an mds request. */ @@ -2053,7 +2096,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_old_dentry) + if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 68288917c737..e90cfccf93bd 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed { /* for readdir results */ struct { struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; int dir_nr; char **dir_dname; u32 *dir_dname_len; @@ -346,7 +347,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); - +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 4440f447fd3f..51cc23e48111 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -54,6 +54,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 10a4ccbf38da..06150fd745ac 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1026,6 +1026,7 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_flock_init(); ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d8801a95b685..ead05cc1f447 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -266,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -577,7 +576,7 @@ struct ceph_file_info { /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ - u64 next_offset; /* offset of next chunk (last_name's + 1) */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ int dir_release_count; @@ -871,6 +870,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern const struct export_operations ceph_export_ops; /* locks.c */ +extern __init void ceph_flock_init(void); extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a55ec37378c6..c9c2b887381e 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -64,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) } static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, - size_t size) + size_t size) { int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; + char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); down_read(&osdc->map_sem); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); - if (pool_name) - ret = snprintf(val, size, - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", + if (pool_name) { + size_t len = strlen(pool_name); + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - pool_name); - else - ret = snprintf(val, size, + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (!size) { + ret += len; + } else if (ret + len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, ret); + memcpy(val + ret, pool_name, len); + ret += len; + } + } else { + ret = snprintf(buf, sizeof(buf), "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", (unsigned long long)ceph_file_layout_su(ci->i_layout), (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), (unsigned long long)ceph_file_layout_object_size(ci->i_layout), (unsigned long long)pool); - + if (size) { + if (ret <= size) + memcpy(val, buf, ret); + else + ret = -ERANGE; + } + } up_read(&osdc->map_sem); return ret; } @@ -215,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { .name_size = sizeof("ceph.dir.layout"), .getxattr_cb = ceph_vxattrcb_layout, .readonly = false, - .hidden = false, + .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, }, XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), @@ -242,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { .name_size = sizeof("ceph.file.layout"), .getxattr_cb = ceph_vxattrcb_layout, .readonly = false, - .hidden = false, + .hidden = true, .exists_cb = ceph_vxattrcb_layout_exists, }, XATTR_LAYOUT_FIELD(file, layout, stripe_unit), @@ -842,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -893,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -1019,7 +1032,6 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -1033,9 +1045,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 849f6132b327..5be1f997ecde 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags); + clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags); + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags); + spin_lock_init(&cifs_inode->writers_lock); + cifs_inode->writers = 0; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; @@ -286,7 +291,7 @@ cifs_destroy_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); cifs_fscache_release_inode_cookie(inode); } @@ -541,6 +546,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } @@ -731,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct inode *inode = file_inode(iocb->ki_filp); + struct cifsInodeInfo *cinode = CIFS_I(inode); ssize_t written; int rc; + written = cifs_get_writer(cinode); + if (written) + return written; + written = generic_file_aio_write(iocb, iov, nr_segs, pos); if (CIFS_CACHE_WRITE(CIFS_I(inode))) - return written; + goto out; rc = filemap_fdatawrite(inode->i_mapping); if (rc) cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n", rc, inode); +out: + cifs_put_writer(cinode); return written; } @@ -849,7 +862,6 @@ const struct inode_operations cifs_file_inode_ops = { /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .getattr = cifs_getattr, /* do we need this anymore? */ - .rename = cifs_rename, .permission = cifs_permission, #ifdef CONFIG_CIFS_XATTR .setxattr = cifs_setxattr, @@ -1005,7 +1017,7 @@ cifs_init_once(void *inode) init_rwsem(&cifsi->lock_sem); } -static int +static int __init cifs_init_inodecache(void) { cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0f3718b77a8..30f6e9251a4a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -228,6 +228,8 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + void (*downgrade_oplock)(struct TCP_Server_Info *, + struct cifsInodeInfo *, bool); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -1113,6 +1115,12 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ + unsigned long flags; +#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ +#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ +#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ + spinlock_t writers_lock; + unsigned int writers; /* Number of writers on this inode */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index acc4ee8ed075..ca7980a1e303 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); +extern int cifs_get_writer(struct cifsInodeInfo *cinode); +extern void cifs_put_writer(struct cifsInodeInfo *cinode); +extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode); extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f3264bd7a83d..6ce4e0954b98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6197,6 +6197,9 @@ QAllEAsRetry: cifs_dbg(FYI, "ea length %d\n", list_len); if (list_len <= 8) { cifs_dbg(FYI, "empty EA list returned from server\n"); + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; goto QAllEAsOut; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 834fce759d80..5ed03e0b8b40 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ssize_t rc = -EACCES; - loff_t lock_pos = pos; + loff_t lock_pos = iocb->ki_pos; - if (file->f_flags & O_APPEND) - lock_pos = i_size_read(inode); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); + mutex_lock(&inode->i_mutex); + if (file->f_flags & O_APPEND) + lock_pos = i_size_read(inode); if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), server->vals->exclusive_lock_type, NULL, - CIFS_WRITE_OP)) - rc = generic_file_aio_write(iocb, iov, nr_segs, pos); + CIFS_WRITE_OP)) { + rc = __generic_file_aio_write(iocb, iov, nr_segs); + mutex_unlock(&inode->i_mutex); + + if (rc > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) + rc = err; + } + } else { + mutex_unlock(&inode->i_mutex); + } up_read(&cinode->lock_sem); return rc; } @@ -2608,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; + written = cifs_get_writer(cinode); + if (written) + return written; + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) - && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) - return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_writev(iocb, iov, nr_segs, pos); + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + written = generic_file_aio_write( + iocb, iov, nr_segs, pos); + goto out; + } + written = cifs_writev(iocb, iov, nr_segs, pos); + goto out; } /* * For non-oplocked files in strict cache mode we need to write the data @@ -2633,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, inode); cinode->oplock = 0; } +out: + cifs_put_writer(cinode); return written; } @@ -2727,56 +2750,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data - * @iov: vector in which we should copy the data - * @nr_segs: number of segments in vector - * @offset: offset into file of the first iovec - * @copied: used to return the amount of data copied to the iov + * @iter: destination for our data * * This function copies data from a list of pages in a readdata response into * an array of iovecs. It will first calculate where the data should go * based on the info in the readdata and then copy the data into that spot. */ -static ssize_t -cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, - unsigned long nr_segs, loff_t offset, ssize_t *copied) +static int +cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - int rc = 0; - struct iov_iter ii; - size_t pos = rdata->offset - offset; - ssize_t remaining = rdata->bytes; - unsigned char *pdata; + size_t remaining = rdata->bytes; unsigned int i; - /* set up iov_iter and advance to the correct offset */ - iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); - iov_iter_advance(&ii, pos); - - *copied = 0; for (i = 0; i < rdata->nr_pages; i++) { - ssize_t copy; struct page *page = rdata->pages[i]; - - /* copy a whole page or whatever's left */ - copy = min_t(ssize_t, remaining, PAGE_SIZE); - - /* ...but limit it to whatever space is left in the iov */ - copy = min_t(ssize_t, copy, iov_iter_count(&ii)); - - /* go while there's data to be copied and no errors */ - if (copy && !rc) { - pdata = kmap(page); - rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset, - (int)copy); - kunmap(page); - if (!rc) { - *copied += copy; - remaining -= copy; - iov_iter_advance(&ii, copy); - } - } + size_t copy = min_t(size_t, remaining, PAGE_SIZE); + size_t written = copy_page_to_iter(page, 0, copy, iter); + remaining -= written; + if (written < copy && iov_iter_count(iter) > 0) + break; } - - return rc; + return remaining ? -EFAULT : 0; } static void @@ -2837,20 +2831,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, return total_read > 0 ? total_read : result; } -static ssize_t -cifs_iovec_read(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *poffset) +ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { + struct file *file = iocb->ki_filp; ssize_t rc; size_t len, cur_len; ssize_t total_read = 0; - loff_t offset = *poffset; + loff_t offset = pos; unsigned int npages; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; struct cifsFileInfo *open_file; struct cifs_readdata *rdata, *tmp; struct list_head rdata_list; + struct iov_iter to; pid_t pid; if (!nr_segs) @@ -2860,6 +2855,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, if (!len) return 0; + iov_iter_init(&to, iov, nr_segs, len, 0); + INIT_LIST_HEAD(&rdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; @@ -2885,7 +2882,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, cifs_uncached_readv_complete); if (!rdata) { rc = -ENOMEM; - goto error; + break; } rc = cifs_read_allocate_pages(rdata, npages); @@ -2917,55 +2914,44 @@ error: if (!list_empty(&rdata_list)) rc = 0; + len = iov_iter_count(&to); /* the loop below should proceed in the order of increasing offsets */ -restart_loop: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + again: if (!rc) { - ssize_t copied; - /* FIXME: freezable sleep too? */ rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) + else if (rdata->result) { rc = rdata->result; - else { - rc = cifs_readdata_to_iov(rdata, iov, - nr_segs, *poffset, - &copied); - total_read += copied; + /* resend call if it's a retryable error */ + if (rc == -EAGAIN) { + rc = cifs_retry_async_readv(rdata); + goto again; + } + } else { + rc = cifs_readdata_to_iov(rdata, &to); } - /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto restart_loop; - } } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); } + total_read = len - iov_iter_count(&to); + cifs_stats_bytes_read(tcon, total_read); - *poffset += total_read; /* mask nodata case */ if (rc == -ENODATA) rc = 0; - return total_read ? total_read : rc; -} - -ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t read; - - read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); - if (read > 0) - iocb->ki_pos = pos; - - return read; + if (total_read) { + iocb->ki_pos = pos + total_read; + return total_read; + } + return rc; } ssize_t @@ -3113,6 +3099,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -3644,6 +3631,13 @@ static int cifs_launder_page(struct page *page) return rc; } +static int +cifs_pending_writers_wait(void *unused) +{ + schedule(); + return 0; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3651,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work) struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); + struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, + cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + + server->ops->downgrade_oplock(server, cinode, + test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", @@ -3689,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + cifs_done_oplock_break(cinode); } /* diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index aadc2b68678b..a22d667f1069 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; + if (!cifs_sb->actimeo) + return true; + if (!time_in_range(jiffies, cifs_i->time, cifs_i->time + cifs_sb->actimeo)) return true; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2f9f3790679d..3b0c62e622da 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) cifs_dbg(FYI, "file id match, oplock break\n"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - cifs_set_oplock_level(pCifsInode, - pSMB->OplockLevel ? OPLOCK_READ : 0); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &pCifsInode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (pSMB->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + queue_work(cifsiod_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } +static int +cifs_oplock_break_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +/* + * We wait for oplock breaks to be processed before we attempt to perform + * writes. + */ +int cifs_get_writer(struct cifsInodeInfo *cinode) +{ + int rc; + +start: + rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, + cifs_oplock_break_wait, TASK_KILLABLE); + if (rc) + return rc; + + spin_lock(&cinode->writers_lock); + if (!cinode->writers) + set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + cinode->writers++; + /* Check to see if we have started servicing an oplock break */ + if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) { + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); + goto start; + } + spin_unlock(&cinode->writers_lock); + return 0; +} + +void cifs_put_writer(struct cifsInodeInfo *cinode) +{ + spin_lock(&cinode->writers_lock); + cinode->writers--; + if (cinode->writers == 0) { + clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS); + } + spin_unlock(&cinode->writers_lock); +} + +void cifs_done_oplock_break(struct cifsInodeInfo *cinode) +{ + clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK); +} + bool backup_cred(struct cifs_sb_info *cifs_sb) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 526fb89f9230..d1fdfa848703 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) return 0; } +static void +cifs_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + cifs_set_oplock_level(cinode, OPLOCK_READ); + else + cifs_set_oplock_level(cinode, 0); +} + static bool cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = { .clear_stats = cifs_clear_stats, .print_stats = cifs_print_stats, .is_oplock_break = is_valid_oplock_break, + .downgrade_oplock = cifs_downgrade_oplock, .check_trans2 = cifs_check_trans2, .need_neg = cifs_need_neg, .negotiate = cifs_negotiate, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index fb3966265b6e..b8021fde987d 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) else cfile->oplock_break_cancelled = false; - server->ops->set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, - 0, NULL); + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, + &cinode->flags); + + /* + * Set flag if the server downgrades the oplock + * to L2 else clear. + */ + if (rsp->OplockLevel) + set_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit( + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 192f51a12cf1..35ddc3ed119d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, } static void +smb2_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + if (set_level2) + server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, + 0, NULL); + else + server->ops->set_oplock_level(cinode, 0, 0, NULL); +} + +static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { @@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 860344701067..3802f8c94acc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { int rc; - char *res_key = NULL; struct compress_ioctl fsctl_input; char *ret_data = NULL; @@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, 2 /* in data len */, &ret_data /* out data */, NULL); cifs_dbg(FYI, "set compression rc %d\n", rc); - kfree(res_key); return rc; } diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index b7143cf783ac..381c993b1427 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -10,7 +10,7 @@ extern int coda_hard; extern int coda_fake_statfs; void coda_destroy_inodecache(void); -int coda_init_inodecache(void); +int __init coda_init_inodecache(void); int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); void coda_sysctl_init(void); void coda_sysctl_clean(void); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 506de34a4ef3..d9c7751f10ac 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -73,7 +73,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -int coda_init_inodecache(void) +int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", sizeof(struct coda_inode_info), @@ -96,6 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } @@ -250,7 +251,7 @@ static void coda_put_super(struct super_block *sb) static void coda_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); coda_cache_clear_inode(inode); } diff --git a/fs/compat.c b/fs/compat.c index 6af20de2c1a3..66d3d3c6b4b2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -72,8 +72,8 @@ int compat_printk(const char *fmt, ...) * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. */ -asmlinkage long compat_sys_utime(const char __user *filename, - struct compat_utimbuf __user *t) +COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, + struct compat_utimbuf __user *, t) { struct timespec tv[2]; @@ -87,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename, return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } -asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags) +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) { struct timespec tv[2]; if (t) { - if (get_compat_timespec(&tv[0], &t[0]) || - get_compat_timespec(&tv[1], &t[1])) + if (compat_get_timespec(&tv[0], &t[0]) || + compat_get_timespec(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -102,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena return do_utimes(dfd, filename, t ? tv : NULL, flags); } -asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t) +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) { struct timespec tv[2]; @@ -121,7 +121,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena return do_utimes(dfd, filename, t ? tv : NULL, 0); } -asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t) +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) { return compat_sys_futimesat(AT_FDCWD, filename, t); } @@ -159,8 +159,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } -asmlinkage long compat_sys_newstat(const char __user * filename, - struct compat_stat __user *statbuf) +COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct compat_stat __user *, statbuf) { struct kstat stat; int error; @@ -171,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename, return cp_compat_stat(&stat, statbuf); } -asmlinkage long compat_sys_newlstat(const char __user * filename, - struct compat_stat __user *statbuf) +COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct compat_stat __user *, statbuf) { struct kstat stat; int error; @@ -184,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename, } #ifndef __ARCH_WANT_STAT64 -asmlinkage long compat_sys_newfstatat(unsigned int dfd, - const char __user *filename, - struct compat_stat __user *statbuf, int flag) +COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, + const char __user *, filename, + struct compat_stat __user *, statbuf, int, flag) { struct kstat stat; int error; @@ -198,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, } #endif -asmlinkage long compat_sys_newfstat(unsigned int fd, - struct compat_stat __user * statbuf) +COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, + struct compat_stat __user *, statbuf) { struct kstat stat; int error = vfs_fstat(fd, &stat); @@ -247,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * * The following statfs calls are copies of code from fs/statfs.c and * should be checked against those from time to time */ -asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) +COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) { struct kstatfs tmp; int error = user_statfs(pathname, &tmp); @@ -256,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta return error; } -asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) +COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) { struct kstatfs tmp; int error = fd_statfs(fd, &tmp); @@ -298,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat return 0; } -asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) +COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) { struct kstatfs tmp; int error; @@ -312,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s return error; } -asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) +COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) { struct kstatfs tmp; int error; @@ -331,7 +331,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c * Given how simple this syscall is that apporach is more maintainable * than the various conversion hacks. */ -asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u) +COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) { struct compat_ustat tmp; struct kstatfs sbuf; @@ -399,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u } #endif -asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, - unsigned long arg) +static unsigned int +convert_fcntl_cmd(unsigned int cmd) +{ + switch (cmd) { + case F_GETLK64: + return F_GETLK; + case F_SETLK64: + return F_SETLK; + case F_SETLKW64: + return F_SETLKW; + } + + return cmd; +} + +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) { mm_segment_t old_fs; struct flock f; long ret; + unsigned int conv_cmd; switch (cmd) { case F_GETLK: @@ -441,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: ret = get_compat_flock64(&f, compat_ptr(arg)); if (ret != 0) break; old_fs = get_fs(); set_fs(KERNEL_DS); - ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK : - ((cmd == F_SETLK64) ? F_SETLK : F_SETLKW), - (unsigned long)&f); + conv_cmd = convert_fcntl_cmd(cmd); + ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); set_fs(old_fs); - if (cmd == F_GETLK64 && ret == 0) { + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { /* need to return lock information - see above for commentary */ if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; @@ -468,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, return ret; } -asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, - unsigned long arg) +COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) { - if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64)) + switch (cmd) { + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: return -EINVAL; + } return compat_sys_fcntl64(fd, cmd, arg); } -asmlinkage long -compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p) +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) { long ret; aio_context_t ctx64; @@ -496,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p) return ret; } -asmlinkage long -compat_sys_io_getevents(aio_context_t ctx_id, - unsigned long min_nr, - unsigned long nr, - struct io_event __user *events, - struct compat_timespec __user *timeout) +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) { - long ret; struct timespec t; struct timespec __user *ut = NULL; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, events, - nr * sizeof(struct io_event)))) - goto out; if (timeout) { - if (get_compat_timespec(&t, timeout)) - goto out; + if (compat_get_timespec(&t, timeout)) + return -EFAULT; ut = compat_alloc_user_space(sizeof(*ut)); if (copy_to_user(ut, &t, sizeof(t)) ) - goto out; + return -EFAULT; } - ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut); -out: - return ret; + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); } /* A write operation does a read from user space and vice versa */ @@ -617,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) #define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) -asmlinkage long -compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb) +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) { struct iocb __user * __user *iocb64; long ret; @@ -770,10 +786,10 @@ static int do_nfs4_super_data_conv(void *raw_data) #define NCPFS_NAME "ncpfs" #define NFS4_NAME "nfs4" -asmlinkage long compat_sys_mount(const char __user * dev_name, - const char __user * dir_name, - const char __user * type, unsigned long flags, - const void __user * data) +COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, + const char __user *, dir_name, + const char __user *, type, compat_ulong_t, flags, + const void __user *, data) { char *kernel_type; unsigned long data_page; @@ -869,8 +885,8 @@ efault: return -EFAULT; } -asmlinkage long compat_sys_old_readdir(unsigned int fd, - struct compat_old_linux_dirent __user *dirent, unsigned int count) +COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct compat_old_linux_dirent __user *, dirent, unsigned int, count) { int error; struct fd f = fdget(fd); @@ -948,8 +964,8 @@ efault: return -EFAULT; } -asmlinkage long compat_sys_getdents(unsigned int fd, - struct compat_linux_dirent __user *dirent, unsigned int count) +COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct compat_linux_dirent __user *, dirent, unsigned int, count) { struct fd f; struct compat_linux_dirent __user * lastdirent; @@ -981,7 +997,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd, return error; } -#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 +#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 struct compat_getdents_callback64 { struct dir_context ctx; @@ -1033,8 +1049,8 @@ efault: return -EFAULT; } -asmlinkage long compat_sys_getdents64(unsigned int fd, - struct linux_dirent64 __user * dirent, unsigned int count) +COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) { struct fd f; struct linux_dirent64 __user * lastdirent; @@ -1066,7 +1082,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd, fdput(f); return error; } -#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */ +#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the @@ -1287,9 +1303,9 @@ out_nofds: return ret; } -asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timeval __user *tvp) +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) { struct timespec end_time, *to = NULL; struct compat_timeval tv; @@ -1320,7 +1336,7 @@ struct compat_sel_arg_struct { compat_uptr_t tvp; }; -asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg) +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; @@ -1381,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, return ret; } -asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, void __user *sig) +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) { compat_size_t sigsetsize = 0; compat_uptr_t up = 0; @@ -1400,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, sigsetsize); } -asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, - unsigned int nfds, struct compat_timespec __user *tsp, - const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index a81147e2e4ef..4d24d17bcfc1 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime, #define ELF_HWCAP COMPAT_ELF_HWCAP #endif +#ifdef COMPAT_ELF_HWCAP2 +#undef ELF_HWCAP2 +#define ELF_HWCAP2 COMPAT_ELF_HWCAP2 +#endif + #ifdef COMPAT_ARCH_DLINFO #undef ARCH_DLINFO #define ARCH_DLINFO COMPAT_ARCH_DLINFO diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 3881610b6438..e82289047272 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1538,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd) return ioctl_pointer[i] == xcmd; } -asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, - unsigned long arg) +COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg32) { + unsigned long arg = arg32; struct fd f = fdget(fd); int error = -EBADF; if (!f.file) diff --git a/fs/coredump.c b/fs/coredump.c index e3ad709a4232..0b2528fb640e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { int free, need; + va_list arg_copy; again: free = cn->size - cn->used; - need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + + va_copy(arg_copy, arg); + need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy); + va_end(arg_copy); + if (need < free) { cn->used += need; return 0; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 06610cf94d57..ddcfe590b8a8 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -195,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i struct page *page = NULL; if (blocknr + i < devsize) { - page = read_mapping_page_async(mapping, blocknr + i, - NULL); + page = read_mapping_page(mapping, blocknr + i, NULL); /* synchronous error? */ if (IS_ERR(page)) page = NULL; @@ -244,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb) static int cramfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/dcache.c b/fs/dcache.c index ca02c13a84aa..be2bea834bf4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) +static void dentry_free(struct dentry *dentry) { - BUG_ON((int)dentry->d_lockref.count > 0); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - /* if dentry was never visible to RCU, immediate free is OK */ if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); @@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry) d_lru_add(dentry); } -/* - * Remove a dentry with references from the LRU. - * - * If we are on the shrink list, then we can get to try_prune_one_dentry() and - * lose our last reference through the parent walk. In this case, we need to - * remove ourselves from the shrink list, not the LRU. - */ -static void dentry_lru_del(struct dentry *dentry) -{ - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) - return d_shrink_del(dentry); - d_lru_del(dentry); - } -} - -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) -{ - list_del(&dentry->d_u.d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); -/* - * Finish off a dentry we've decided to kill. - * dentry->d_lock must be held, returns with it unlocked. - * If ref is non-zero, then decrement the refcount too. - * Returns dentry requiring refcount drop, or NULL if we're done. - */ -static struct dentry * -dentry_kill(struct dentry *dentry, int unlock_on_failure) - __releases(dentry->d_lock) +static void __dentry_kill(struct dentry *dentry) { - struct inode *inode; - struct dentry *parent; - - inode = dentry->d_inode; - if (inode && !spin_trylock(&inode->i_lock)) { -relock: - if (unlock_on_failure) { - spin_unlock(&dentry->d_lock); - cpu_relax(); - } - return dentry; /* try again with same dentry */ - } - if (IS_ROOT(dentry)) - parent = NULL; - else + struct dentry *parent = NULL; + bool can_free = true; + if (!IS_ROOT(dentry)) parent = dentry->d_parent; - if (parent && !spin_trylock(&parent->d_lock)) { - if (inode) - spin_unlock(&inode->i_lock); - goto relock; - } /* * The dentry is now unrecoverably dead to the world. @@ -543,10 +460,103 @@ relock: if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + list_del(&dentry->d_u.d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON((int)dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); +} + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * If ref is non-zero, then decrement the refcount too. + * Returns dentry requiring refcount drop, or NULL if we're done. + */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto failed; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + if (inode) + spin_unlock(&inode->i_lock); + goto failed; + } + } + + __dentry_kill(dentry); + return parent; + +failed: + spin_unlock(&dentry->d_lock); + cpu_relax(); + return dentry; /* try again with same dentry */ +} + +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + spin_unlock(&dentry->d_lock); + rcu_read_lock(); +again: + parent = ACCESS_ONCE(dentry->d_parent); + spin_lock(&parent->d_lock); + /* + * We can't blindly lock dentry until we are sure + * that we won't violate the locking order. + * Any changes of dentry->d_parent must have + * been done with parent->d_lock held, so + * spin_lock() above is enough of a barrier + * for checking if it's still our child. + */ + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + goto again; + } + rcu_read_unlock(); + if (parent != dentry) + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + else + parent = NULL; + return parent; } /* @@ -602,7 +612,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry, 1); + dentry = dentry_kill(dentry); if (dentry) goto repeat; } @@ -815,64 +825,15 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static struct dentry * try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct dentry *parent; - - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return NULL; - if (parent == dentry) - return dentry; - - /* Prune ancestors. */ - dentry = parent; - while (dentry) { - if (lockref_put_or_lock(&dentry->d_lockref)) - return NULL; - dentry = dentry_kill(dentry, 1); - } - return NULL; -} - static void shrink_dentry_list(struct list_head *list) { - struct dentry *dentry; + struct dentry *dentry, *parent; - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ - - /* - * Get the dentry lock, and re-verify that the dentry is - * this on the shrinking list. If it is, we know that - * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. - */ + while (!list_empty(list)) { + struct inode *inode; + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } + parent = lock_parent(dentry); /* * The dispose list is isolated and dentries are not accounted @@ -885,30 +846,63 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if (dentry->d_lockref.count) { + if ((int)dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); continue; } - rcu_read_unlock(); - /* - * If 'try_to_prune()' returns a dentry, it will - * be the same one we passed in, and d_lock will - * have been held the whole time, so it will not - * have been added to any other lists. We failed - * to get the inode lock. - * - * We just add it back to the shrink list. - */ - dentry = try_prune_one_dentry(dentry); - rcu_read_lock(); - if (dentry) { + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + bool can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + if (can_free) + dentry_free(dentry); + continue; + } + + inode = dentry->d_inode; + if (inode && unlikely(!spin_trylock(&inode->i_lock))) { d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + continue; + } + + __dentry_kill(dentry); + + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) { + parent = lock_parent(dentry); + if (dentry->d_lockref.count != 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + break; + } + inode = dentry->d_inode; /* can't be NULL */ + if (unlikely(!spin_trylock(&inode->i_lock))) { + spin_unlock(&dentry->d_lock); + if (parent) + spin_unlock(&parent->d_lock); + cpu_relax(); + continue; + } + __dentry_kill(dentry); + dentry = parent; } } - rcu_read_unlock(); } static enum lru_status @@ -1261,34 +1255,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - ret = D_WALK_NORETRY; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ - if (data->found && need_resched()) - ret = D_WALK_QUIT; + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } @@ -1318,45 +1301,35 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); -static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { - struct select_data *data = _data; - enum d_walk_ret ret = D_WALK_CONTINUE; + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - if (likely(!list_empty(&dentry->d_subdirs))) - goto out; - if (dentry == data->start && dentry->d_lockref.count == 1) - goto out; - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, + dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - BUG(); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); - data->found++; - ret = D_WALK_NORETRY; - } -out: - if (data->found && need_resched()) - ret = D_WALK_QUIT; - return ret; + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); } /* @@ -1366,40 +1339,15 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; - for (;;) { - struct select_data data; - - INIT_LIST_HEAD(&data.dispose); - data.start = dentry; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (!data.found) - break; - - shrink_dentry_list(&data.dispose); - cond_resched(); - } - d_drop(dentry); - dput(dentry); + do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { - struct select_data data; - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - - INIT_LIST_HEAD(&data.dispose); - data.start = NULL; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (data.found) - shrink_dentry_list(&data.dispose); - cond_resched(); + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); } } @@ -1647,8 +1595,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_ENTRY_TYPE; - dentry->d_flags |= add_flags; + __d_set_type(dentry, add_flags); if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; @@ -2483,12 +2430,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target) dentry->d_name.name = dentry->d_iname; } else { /* - * Both are internal. Just copy target to dentry + * Both are internal. */ - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.len = target->d_name.len; - return; + unsigned int i; + BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); + for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { + swap(((long *) &dentry->d_iname)[i], + ((long *) &target->d_iname)[i]); + } } } swap(dentry->d_name.len, target->d_name.len); @@ -2545,13 +2494,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry, * __d_move - move a dentry * @dentry: entry to move * @target: new dentry + * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ -static void __d_move(struct dentry * dentry, struct dentry * target) +static void __d_move(struct dentry *dentry, struct dentry *target, + bool exchange) { if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2573,8 +2524,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target) __d_drop(dentry); __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); - /* Unhash the target: dput() will then get rid of it */ + /* + * Unhash the target (d_delete() is not usable here). If exchanging + * the two dentries, then rehash onto the other's hash queue. + */ __d_drop(target); + if (exchange) { + __d_rehash(target, + d_hash(dentry->d_parent, dentry->d_name.hash)); + } list_del(&dentry->d_u.d_child); list_del(&target->d_u.d_child); @@ -2601,6 +2559,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target) write_seqcount_end(&dentry->d_seq); dentry_unlock_parents_for_move(dentry, target); + if (exchange) + fsnotify_d_move(target); spin_unlock(&target->d_lock); fsnotify_d_move(dentry); spin_unlock(&dentry->d_lock); @@ -2618,11 +2578,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target) void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); - __d_move(dentry, target); + __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); +/* + * d_exchange - exchange two dentries + * @dentry1: first dentry + * @dentry2: second dentry + */ +void d_exchange(struct dentry *dentry1, struct dentry *dentry2) +{ + write_seqlock(&rename_lock); + + WARN_ON(!dentry1->d_inode); + WARN_ON(!dentry2->d_inode); + WARN_ON(IS_ROOT(dentry1)); + WARN_ON(IS_ROOT(dentry2)); + + __d_move(dentry1, dentry2, true); + + write_sequnlock(&rename_lock); +} + /** * d_ancestor - search for an ancestor * @p1: ancestor dentry @@ -2670,7 +2649,7 @@ static struct dentry *__d_unalias(struct inode *inode, m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: if (likely(!d_mountpoint(alias))) { - __d_move(alias, dentry); + __d_move(alias, dentry, false); ret = alias; } out_err: @@ -3112,6 +3091,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) end = ERR_PTR(-ENAMETOOLONG); return end; } +EXPORT_SYMBOL(simple_dname); /* * Write full pathname from the root of the filesystem into the buffer. diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 9c0444cccbe1..8c41b52da358 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data) int err; struct debugfs_fs_info *fsi = sb->s_fs_info; + sync_filesystem(sb); err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; @@ -358,7 +359,7 @@ exit: * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on @@ -400,7 +401,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. @@ -425,7 +426,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This - * should be a directory dentry if set. If this paramater is NULL, + * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. * @target: a pointer to a string containing the path to the target of the diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index a726b9f29cb7..c71038079b47 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; + sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* diff --git a/fs/direct-io.c b/fs/direct-io.c index 160a5489a939..31ba0935e32e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -664,7 +664,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, goto out; sector = start_sector << (sdio->blkbits - 9); nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); - nr_pages = min(nr_pages, BIO_MAX_PAGES); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; @@ -1194,13 +1193,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* - * For file extending writes updating i_size before data - * writeouts complete can expose uninitialized blocks. So - * even for AIO, we need to wait for i/o to complete before - * returning in this case. + * For file extending writes updating i_size before data writeouts + * complete can expose uninitialized blocks in dumb filesystems. + * In that case we need to wait for I/O completion even if asked + * for an asynchronous write. */ - dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && - (end > i_size_read(inode))); + if (is_sync_kiocb(iocb)) + dio->is_async = false; + else if (!(dio->flags & DIO_ASYNC_EXTEND) && + (rw & WRITE) && end > i_size_read(inode)) + dio->is_async = false; + else + dio->is_async = true; + dio->inode = inode; dio->rw = rw; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 0e90f0c91b93..dcea1e37a1b7 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -14,6 +14,7 @@ #include "dlm_internal.h" #include "lock.h" #include "user.h" +#include "ast.h" static uint64_t dlm_cb_seq; static DEFINE_SPINLOCK(dlm_cb_seq_spin); @@ -308,6 +309,6 @@ void dlm_callback_resume(struct dlm_ls *ls) mutex_unlock(&ls->ls_cb_mutex); if (count) - log_debug(ls, "dlm_callback_resume %d", count); + log_rinfo(ls, "dlm_callback_resume %d", count); } diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 278a75cda446..d975851a7e1e 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -68,7 +68,7 @@ int dlm_recover_directory(struct dlm_ls *ls) uint16_t namelen; unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; - log_debug(ls, "dlm_recover_directory"); + log_rinfo(ls, "dlm_recover_directory"); if (dlm_no_directory(ls)) goto out_status; @@ -189,7 +189,7 @@ int dlm_recover_directory(struct dlm_ls *ls) error = 0; dlm_set_recover_status(ls, DLM_RS_DIR); - log_debug(ls, "dlm_recover_directory %u in %u new", + log_rinfo(ls, "dlm_recover_directory %u in %u new", count, count_add); out_free: kfree(last_name); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index e7665c31f7b1..5eff6ea3e27f 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -65,6 +65,8 @@ struct dlm_mhandle; printk(KERN_ERR "dlm: "fmt"\n" , ##args) #define log_error(ls, fmt, args...) \ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) +#define log_rinfo(ls, fmt, args...) \ + printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args); #define log_debug(ls, fmt, args...) \ do { \ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index e223a911a834..83f3d5520307 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len, log_error(ls, "find_rsb new from_other %d dir %d our %d %s", from_nodeid, dir_nodeid, our_nodeid, r->res_name); dlm_free_rsb(r); + r = NULL; error = -ENOTBLK; goto out_unlock; } @@ -5462,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls) up_write(&ls->ls_root_sem); if (lkb_count) - log_debug(ls, "dlm_recover_purge %u locks for %u nodes", + log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes", lkb_count, nodes_count); } @@ -5536,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls) } if (lkb_count) - log_debug(ls, "dlm_recover_grant %u locks on %u resources", + log_rinfo(ls, "dlm_recover_grant %u locks on %u resources", lkb_count, rsb_count); } @@ -5695,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) put_rsb(r); out: if (error && error != -EEXIST) - log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", + log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", from_nodeid, remid, error); rl->rl_result = cpu_to_le32(error); return error; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d5abafd56a6d..04d6398c1f1c 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -190,7 +190,7 @@ static int do_uevent(struct dlm_ls *ls, int in) else kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); - log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); + log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving"); /* dlm_controld will see the uevent, do the necessary group management and then write to sysfs to wake us */ @@ -198,7 +198,7 @@ static int do_uevent(struct dlm_ls *ls, int in) error = wait_event_interruptible(ls->ls_uevent_wait, test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); - log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); + log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result); if (error) goto out; @@ -640,7 +640,7 @@ static int new_lockspace(const char *name, const char *cluster, dlm_create_debug_file(ls); - log_debug(ls, "join complete"); + log_rinfo(ls, "join complete"); *lockspace = ls; return 0; @@ -835,7 +835,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_clear_members(ls); dlm_clear_members_gone(ls); kfree(ls->ls_node_array); - log_debug(ls, "release_lockspace final free"); + log_rinfo(ls, "release_lockspace final free"); kobject_put(&ls->ls_kobj); /* The ls structure will be freed when the kobject is done with */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3190ca973dd6..1e5b45359509 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) } /* Data available on socket or listen socket received a connect */ -static void lowcomms_data_ready(struct sock *sk, int count_unused) +static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 476557b54921..9c47f1c14a8b 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -60,18 +60,15 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc) #define SLOT_DEBUG_LINE 128 -static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, - struct rcom_slot *ro0, struct dlm_slot *array, - int array_size) +static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, + struct rcom_slot *ro0, struct dlm_slot *array, + int array_size) { char line[SLOT_DEBUG_LINE]; int len = SLOT_DEBUG_LINE - 1; int pos = 0; int ret, i; - if (!dlm_config.ci_log_debug) - return; - memset(line, 0, sizeof(line)); if (array) { @@ -95,7 +92,7 @@ static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, } } - log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); + log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line); } int dlm_slots_copy_in(struct dlm_ls *ls) @@ -129,7 +126,7 @@ int dlm_slots_copy_in(struct dlm_ls *ls) ro->ro_slot = le16_to_cpu(ro->ro_slot); } - log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); + log_slots(ls, gen, num_slots, ro0, NULL, 0); list_for_each_entry(memb, &ls->ls_nodes, list) { for (i = 0, ro = ro0; i < num_slots; i++, ro++) { @@ -274,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, gen++; - log_debug_slots(ls, gen, num, NULL, array, array_size); + log_slots(ls, gen, num, NULL, array, array_size); max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - sizeof(struct rcom_config)) / sizeof(struct rcom_slot); @@ -447,7 +444,7 @@ static int ping_members(struct dlm_ls *ls) break; } if (error) - log_debug(ls, "ping_members aborted %d last nodeid %d", + log_rinfo(ls, "ping_members aborted %d last nodeid %d", error, ls->ls_recover_nodeid); return error; } @@ -539,7 +536,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) count as a negative change so the "neg" recovery steps will happen */ list_for_each_entry(memb, &ls->ls_nodes_gone, list) { - log_debug(ls, "prev removed member %d", memb->nodeid); + log_rinfo(ls, "prev removed member %d", memb->nodeid); neg++; } @@ -551,10 +548,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) continue; if (!node) { - log_debug(ls, "remove member %d", memb->nodeid); + log_rinfo(ls, "remove member %d", memb->nodeid); } else { /* removed and re-added */ - log_debug(ls, "remove member %d comm_seq %u %u", + log_rinfo(ls, "remove member %d comm_seq %u %u", memb->nodeid, memb->comm_seq, node->comm_seq); } @@ -571,7 +568,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) if (dlm_is_member(ls, node->nodeid)) continue; dlm_add_member(ls, node); - log_debug(ls, "add member %d", node->nodeid); + log_rinfo(ls, "add member %d", node->nodeid); } list_for_each_entry(memb, &ls->ls_nodes, list) { @@ -591,7 +588,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) complete(&ls->ls_members_done); } - log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); + log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index a6bc63f6e31b..eaea789bf97d 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -526,7 +526,7 @@ int dlm_recover_masters(struct dlm_ls *ls) int nodir = dlm_no_directory(ls); int error; - log_debug(ls, "dlm_recover_masters"); + log_rinfo(ls, "dlm_recover_masters"); down_read(&ls->ls_root_sem); list_for_each_entry(r, &ls->ls_root_list, res_root_list) { @@ -552,7 +552,7 @@ int dlm_recover_masters(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_masters %u of %u", count, total); + log_rinfo(ls, "dlm_recover_masters %u of %u", count, total); error = dlm_wait_function(ls, &recover_idr_empty); out: @@ -685,7 +685,7 @@ int dlm_recover_locks(struct dlm_ls *ls) } up_read(&ls->ls_root_sem); - log_debug(ls, "dlm_recover_locks %d out", count); + log_rinfo(ls, "dlm_recover_locks %d out", count); error = dlm_wait_function(ls, &recover_list_empty); out: @@ -883,7 +883,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls) up_read(&ls->ls_root_sem); if (count) - log_debug(ls, "dlm_recover_rsbs %d done", count); + log_rinfo(ls, "dlm_recover_rsbs %d done", count); } /* Create a single list of all root rsb's to be used during recovery */ @@ -950,6 +950,6 @@ void dlm_clear_toss(struct dlm_ls *ls) } if (count) - log_debug(ls, "dlm_clear_toss %u done", count); + log_rinfo(ls, "dlm_clear_toss %u done", count); } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 32f9f8926ec3..6859b4bf971e 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -55,7 +55,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) unsigned long start; int error, neg = 0; - log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); + log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq); mutex_lock(&ls->ls_recoverd_active); @@ -76,7 +76,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_members(ls, rv, &neg); if (error) { - log_debug(ls, "dlm_recover_members error %d", error); + log_rinfo(ls, "dlm_recover_members error %d", error); goto fail; } @@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_members_wait(ls); if (error) { - log_debug(ls, "dlm_recover_members_wait error %d", error); + log_rinfo(ls, "dlm_recover_members_wait error %d", error); goto fail; } @@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory(ls); if (error) { - log_debug(ls, "dlm_recover_directory error %d", error); + log_rinfo(ls, "dlm_recover_directory error %d", error); goto fail; } @@ -111,11 +111,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_directory_wait(ls); if (error) { - log_debug(ls, "dlm_recover_directory_wait error %d", error); + log_rinfo(ls, "dlm_recover_directory_wait error %d", error); goto fail; } - log_debug(ls, "dlm_recover_directory %u out %u messages", + log_rinfo(ls, "dlm_recover_directory %u out %u messages", ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); /* @@ -144,7 +144,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_masters(ls); if (error) { - log_debug(ls, "dlm_recover_masters error %d", error); + log_rinfo(ls, "dlm_recover_masters error %d", error); goto fail; } @@ -154,7 +154,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks(ls); if (error) { - log_debug(ls, "dlm_recover_locks error %d", error); + log_rinfo(ls, "dlm_recover_locks error %d", error); goto fail; } @@ -162,11 +162,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "dlm_recover_locks_wait error %d", error); + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; } - log_debug(ls, "dlm_recover_locks %u in", + log_rinfo(ls, "dlm_recover_locks %u in", ls->ls_recover_locks_in); /* @@ -186,7 +186,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_locks_wait(ls); if (error) { - log_debug(ls, "dlm_recover_locks_wait error %d", error); + log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; } } @@ -205,7 +205,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = dlm_recover_done_wait(ls); if (error) { - log_debug(ls, "dlm_recover_done_wait error %d", error); + log_rinfo(ls, "dlm_recover_done_wait error %d", error); goto fail; } @@ -217,25 +217,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) error = enable_locking(ls, rv->seq); if (error) { - log_debug(ls, "enable_locking error %d", error); + log_rinfo(ls, "enable_locking error %d", error); goto fail; } error = dlm_process_requestqueue(ls); if (error) { - log_debug(ls, "dlm_process_requestqueue error %d", error); + log_rinfo(ls, "dlm_process_requestqueue error %d", error); goto fail; } error = dlm_recover_waiters_post(ls); if (error) { - log_debug(ls, "dlm_recover_waiters_post error %d", error); + log_rinfo(ls, "dlm_recover_waiters_post error %d", error); goto fail; } dlm_recover_grant(ls); - log_debug(ls, "dlm_recover %llu generation %u done: %u ms", + log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms", (unsigned long long)rv->seq, ls->ls_generation, jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); @@ -245,7 +245,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) fail: dlm_release_root_list(ls); - log_debug(ls, "dlm_recover %llu error %d", + log_rinfo(ls, "dlm_recover %llu error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); return error; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 9fd702f5bfb2..9280202e488c 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { - if (sysctl_drop_caches & 1) + static int stfu; + + if (sysctl_drop_caches & 1) { iterate_supers(drop_pagecache_sb, NULL); - if (sysctl_drop_caches & 2) + count_vm_event(DROP_PAGECACHE); + } + if (sysctl_drop_caches & 2) { drop_slab(); + count_vm_event(DROP_SLAB); + } + if (!stfu) { + pr_info("%s (%d): drop_caches: %d\n", + current->comm, task_pid_nr(current), + sysctl_drop_caches); + } + stfu |= sysctl_drop_caches & 4; } return 0; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b167ca48b8ee..d4a9431ec73c 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -641,7 +641,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, } rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry, - NULL); + NULL, 0); if (rc) goto out_lock; if (target_inode) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index e879cf8ff0b1..afa1b81c3418 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static void ecryptfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); iput(ecryptfs_inode_to_lower(inode)); } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 8dd524f32284..cdb2971192a5 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -21,7 +21,7 @@ static ssize_t efivarfs_file_write(struct file *file, u32 attributes; struct inode *inode = file->f_mapping->host; unsigned long datasize = count - sizeof(attributes); - ssize_t bytes = 0; + ssize_t bytes; bool set = false; if (count < sizeof(attributes)) @@ -33,14 +33,9 @@ static ssize_t efivarfs_file_write(struct file *file, if (attributes & ~(EFI_VARIABLE_MASK)) return -EINVAL; - data = kmalloc(datasize, GFP_KERNEL); - if (!data) - return -ENOMEM; - - if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) { - bytes = -EFAULT; - goto out; - } + data = memdup_user(userbuf + sizeof(attributes), datasize); + if (IS_ERR(data)) + return PTR_ERR(data); bytes = efivar_entry_set_get_size(var, attributes, &datasize, data, &set); diff --git a/fs/efs/super.c b/fs/efs/super.c index 50215bbd6463..3befcc9f5d63 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -91,7 +91,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), @@ -114,6 +114,7 @@ static void destroy_inodecache(void) static int efs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/exec.c b/fs/exec.c index 3d78fccdd723..238b7aa26f68 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,6 +26,7 @@ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> @@ -97,6 +98,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt) module_put(fmt->module); } +#ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. @@ -156,6 +158,7 @@ exit: out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* @@ -654,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP - /* Limit stack size to 1GB */ + /* Limit stack size */ stack_base = rlimit_max(RLIMIT_STACK); - if (stack_base > (1 << 30)) - stack_base = 1 << 30; + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) @@ -810,7 +813,7 @@ EXPORT_SYMBOL(kernel_read); ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { - ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_range(addr, addr + len); return res; @@ -820,7 +823,7 @@ EXPORT_SYMBOL(read_code); static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -846,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { up_read(&old_mm->mmap_sem); @@ -1041,7 +1046,7 @@ EXPORT_SYMBOL_GPL(get_task_comm); * so that a new one can be started */ -void set_task_comm(struct task_struct *tsk, char *buf) +void set_task_comm(struct task_struct *tsk, const char *buf) { task_lock(tsk); trace_task_rename(tsk, buf); @@ -1050,21 +1055,6 @@ void set_task_comm(struct task_struct *tsk, char *buf) perf_event_comm(tsk); } -static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) -{ - int i, ch; - - /* Copies the binary name from after last slash */ - for (i = 0; (ch = *(fn++)) != '\0';) { - if (ch == '/') - i = 0; /* overwrite what we wrote */ - else - if (i < len - 1) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; -} - int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1078,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; set_mm_exe_file(bprm->mm, bprm->file); - - filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm)); /* * Release all of the old mmap stuff */ @@ -1122,7 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - set_task_comm(current, bprm->tcomm); + set_task_comm(current, kbasename(bprm->filename)); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on @@ -1619,9 +1607,9 @@ SYSCALL_DEFINE3(execve, return do_execve(getname(filename), argv, envp); } #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_execve(const char __user * filename, - const compat_uptr_t __user * argv, - const compat_uptr_t __user * envp) +COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index ee4317faccb1..d1c244d67667 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1486,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode) struct ore_io_state *ios; int ret; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: should do better here */ if (inode->i_nlink || is_bad_inode(inode)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 7682b970d0f1..4e2c032ab8a1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -21,12 +21,12 @@ #undef ORE_DBGMSG2 #define ORE_DBGMSG2 ORE_DBGMSG -struct page *_raid_page_alloc(void) +static struct page *_raid_page_alloc(void) { return alloc_page(GFP_KERNEL); } -void _raid_page_free(struct page *p) +static void _raid_page_free(struct page *p) { __free_page(p); } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9d9763328734..ed73ed8ebbee 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -543,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } -int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, +static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_dev **peds) { struct __alloc_ore_devs_and_exofs_devs { diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 1b8001bbe947..27695e6f4e46 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -4,7 +4,6 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/capability.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7cadd823bb31..7d66fb0e4cca 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 94ed36849b71..b1d2a4675d42 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode) dquot_drop(inode); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (want_delete) { sb_start_intwrite(inode->i_sb); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 20d6697bd638..3750031cfa2f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -192,7 +192,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), @@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; + sync_filesystem(sb); spin_lock(&sbi->s_lock); /* Store the old options */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index cfedb2cb0d8c..c0ebc4db8849 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, value, size, flags); } -int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; int err = 0; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 22548f56197b..158b5d4ce067 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1727,10 +1727,7 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - err = ext3_journal_dirty_metadata(handle, gdp_bh); - if (!fatal) - fatal = err; - + fatal = ext3_journal_dirty_metadata(handle, gdp_bh); if (fatal) goto out; diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index e66e4808719f..17742eed2c16 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp) * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) * will be invalid once the directory was converted into a dx directory */ -loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) +static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 082afd78b107..a1b810230cc5 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) int best_ndir = inodes_per_group; int best_group = -1; - get_random_bytes(&group, sizeof(group)); + group = prandom_u32(); parent_group = (unsigned)group % ngroups; for (i = 0; i < ngroups; i++) { group = (parent_group + i) % ngroups; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 384b6ebb655f..f5157d0d1b43 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode) log_wait_commit(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); ext3_discard_reservation(inode); rsv = ei->i_block_alloc_info; @@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh) } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext3_get_block() should be journalled - * along with the data so we don't crash and then get metadata which + * Note that whenever we need to map blocks we start a transaction even if + * we're not journalling data. This is to preserve ordering: any hole + * instantiation within __block_write_full_page -> ext3_get_block() should be + * journalled along with the data so we don't crash and then get metadata which * refers to old data. * * In all journalling modes block_write_full_page() will start the I/O. * - * Problem: - * - * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext3_writepage() - * - * Similar for: - * - * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext3_get_block(). We will deadlock on various things like - * lock_journal and i_truncate_mutex. - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * * We don't honour synchronous mounts for writepage(). That would be * disastrous. Any write() or metadata operation will sync the fs for * us. - * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. */ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) @@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page, * block_write_full_page() succeeded. Otherwise they are unmapped, * and generally junk. */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, + if (ret == 0) + ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_data_fn); - if (!ret) - ret = err; - } walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, bput_one); err = ext3_journal_stop(handle); @@ -1925,6 +1883,8 @@ retry: * and pretend the write failed... */ ext3_truncate_failed_direct_write(inode); ret = PTR_ERR(handle); + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); goto out; } if (inode->i_nlink) @@ -3212,21 +3172,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (for sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -3238,13 +3197,13 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) { - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (ext3_journal_current_handle()) { @@ -3253,7 +3212,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext3_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; return ext3_force_commit(inode->i_sb); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 37fd31ed16e7..08cdfe5461e3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -527,7 +527,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", sizeof(struct ext3_inode_info), @@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif + sync_filesystem(sb); + /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3387664ad70e..722c2bf9645d 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int ext3_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) { const struct xattr *xattr; handle_t *handle = fs_info; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6ea7b1436bbc..5c56785007e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d3a534fdc5ff..66946aa62127 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -31,6 +31,7 @@ #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <crypto/hash.h> +#include <linux/falloc.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -567,6 +568,8 @@ enum { #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Do not put hole in extent cache */ #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 /* * The bit position of these flags must not overlap with any of the @@ -998,6 +1001,8 @@ struct ext4_inode_info { #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1326,6 +1331,7 @@ struct ext4_sb_info { struct list_head s_es_lru; unsigned long s_es_last_sorted; struct percpu_counter s_extent_cache_cnt; + struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ @@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); -extern int ext4_block_zero_page_range(handle_t *handle, - struct address_space *mapping, loff_t from, loff_t length); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -2462,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; @@ -2757,6 +2744,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2766,6 +2754,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3fe29de832c8..c3fb607413ed 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74bc2d549c58..01b0c208f625 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,6 @@ #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/falloc.h> #include <asm/uaccess.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" @@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); @@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + if (ext4_ext_is_uninitialized(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN))) + return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; @@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; + int merge_done = 0, uninit; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); + if (uninit) + ext4_ext_mark_uninitialized(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - int mb_flags = 0; + int mb_flags = 0, uninit; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, path + depth); if (err) return err; - + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1971,10 +1980,13 @@ prepend: if (err) return err; + uninit = ext4_ext_is_uninitialized(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); + /* + * If we're starting with an extent other than the last one in the + * node, we need to see if it shares a cluster with the extent to + * the right (towards the end of the file). If its leftmost cluster + * is this extent's rightmost cluster and it is not cluster aligned, + * we'll mark it as a partial that is not to be deallocated. + */ + + if (ex != EXT_LAST_EXTENT(eh)) { + ext4_fsblk_t current_pblk, right_pblk; + long long current_cluster, right_cluster; + + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); + right_pblk = ext4_ext_pblock(ex + 1); + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); + if (current_cluster == right_cluster && + EXT4_PBLK_COFF(sbi, right_pblk)) + *partial_cluster = -right_cluster; + } + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && @@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, err = ext4_ext_correct_indexes(handle, inode, path); /* - * Free the partial cluster only if the current extent does not - * reference it. Otherwise we might free used cluster. + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If there's a partial cluster and no extents + * remain in the leaf, it can't be freed here. It can only be + * freed when it's possible to determine if it's not shared with + * any other extent - when the next leaf is processed or when space + * removal is complete. */ - if (*partial_cluster > 0 && + if (*partial_cluster > 0 && eh->eh_entries && (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != *partial_cluster)) { int flags = get_default_free_blocks_flags(inode); @@ -3275,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } uninitialized = ext4_ext_is_uninitialized(ex); split_flag1 = 0; @@ -3569,6 +3612,8 @@ out: * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Somone is writing in middle of the extent * + * This works the same way in the case of initialized -> unwritten conversion. + * * One of more index blocks maybe needed if the extent tree grow after * the uninitialized extent split. To prevent ENOSPC occur at the IO * complete, we need to split the uninitialized extent before DIO submit @@ -3579,7 +3624,7 @@ out: * * Returns the size of uninitialized extent to be written on success. */ -static int ext4_split_unwritten_extents(handle_t *handle, +static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, @@ -3591,9 +3636,9 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("ext4_split_unwritten_extents: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", + __func__, inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; @@ -3608,14 +3653,79 @@ static int ext4_split_unwritten_extents(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - if (flags & EXT4_GET_BLOCKS_CONVERT) - split_flag |= EXT4_EXT_DATA_VALID2; + /* Convert to unwritten */ + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { + split_flag |= EXT4_EXT_DATA_VALID1; + /* Convert to initialized */ + } else if (flags & EXT4_GET_BLOCKS_CONVERT) { + split_flag |= ee_block + ee_len <= eof_block ? + EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2); + } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, path, map, split_flag, flags); } +static int ext4_convert_initialized_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as uninitialized */ + ext4_ext_mark_uninitialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + + static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3649,8 +3759,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_unwritten_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT); + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT); if (err < 0) goto out; ext4_ext_drop_refs(path); @@ -3851,6 +3961,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * uninitialized extent + */ + if (map->m_len > EXT_UNINIT_MAX_LEN) + map->m_len = EXT_UNINIT_MAX_LEN / 2; + + ret = ext4_convert_initialized_extents(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + + return err ? err : allocated; +} + +static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path *path, int flags, @@ -3877,8 +4019,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); + ret = ext4_split_convert_extents(handle, inode, map, + path, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -3993,10 +4135,6 @@ out1: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } return err ? err : allocated; } @@ -4128,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4170,6 +4308,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* * Uninitialized extents are treated as holes, except that * we split out initialized portions during a write. @@ -4186,13 +4325,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); - if (!ext4_ext_is_uninitialized(ex)) + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_uninitialized(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = ext4_ext_convert_initialized_extent( + handle, inode, map, path, flags, + allocated, newblock); + goto out2; + } else if (!ext4_ext_is_uninitialized(ex)) goto out; - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); - goto out3; + if (ret < 0) + err = ret; + else + allocated = ret; + goto out2; } } @@ -4473,7 +4626,6 @@ out2: kfree(path); } -out3: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); ext4_es_lru_add(inode); @@ -4514,34 +4666,203 @@ retry: ext4_std_error(inode->i_sb, err); } -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, int flags, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; + + map.m_lblk = offset; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. + */ + if (len <= EXT_UNINIT_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, len); + +retry: + while (ret >= 0 && ret < len) { + map.m_lblk = map.m_lblk + ret; + map.m_len = len = len - ret; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? ret2 : ret; +} + +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) { - struct timespec now; + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int partial; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; + + trace_ext4_zero_range(inode, offset, len, mode); - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; } + + /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + /* - * Update only when preallocation was requested beyond - * the file size. + * Indirect files do not support unwritten extnets */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + /* + * If we have a partial block after EOF we have to allocate + * the entire block. + */ + if (partial) + max_blocks += 1; + } + + if (max_blocks > 0) { + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Remove entire range from the extent status tree. + */ + ret = ext4_es_remove_extent(inode, lblk, max_blocks); + if (ret) + goto out_dio; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, + mode); + if (ret) + goto out_dio; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { if (new_size > i_size_read(inode)) i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); } else { /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (new_size > i_size_read(inode)) + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* @@ -4555,17 +4876,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); handle_t *handle; - loff_t new_size; + loff_t new_size = 0; unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; int flags; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + ext4_lblk_t lblk; + struct timespec tv; + unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4582,83 +4903,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); + trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; + lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because * If blocksize = 4096 offset = 3072 and len = 2048 */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; - } + - lblk; + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. - */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - inode->i_ino, map.m_lblk, - map.m_len, ret); -#endif - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; + mutex_lock(&inode->i_mutex); - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - if ((file->f_flags & O_SYNC) && ret >= max_blocks) - ext4_handle_sync(handle); - ret2 = ext4_journal_stop(handle); - if (ret2) - break; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + if (ret) + goto out; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + goto out; + + tv = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { + if (new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + inode->i_mtime = tv; + } + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } + ext4_mark_inode_dirty(handle, inode); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out: mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? ret2 : ret; + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; } /* @@ -4869,3 +5176,333 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_es_lru_add(inode); return error; } + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = le32_to_cpu(ex_last->ee_block) + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; + } + + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; + + /* + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. + */ + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + ext4_ext_drop_refs(path); + kfree(path); + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + + current_block = le32_to_cpu(extent->ee_block); + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; + break; + } + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) + break; + } + + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size, ioffset; + int ret; + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || + len & (EXT4_BLOCK_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) + return -EOPNOTSUPP; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + ext4_discard_preallocations(inode); + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3981ff783950..0ebc21204b51 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u) %llu %llx", + printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); @@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " - "want to add an delayed/hole extent " - "[%d/%d/%llu/%llx]\n", + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, @@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " - "to add an written/unwritten extent " - "[%d/%d/%llu/%llx]\n", inode->i_ino, + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } @@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, */ pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " - "delayed/hole extent [%d/%d/%llu/%llx]\n", + "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, if (ext4_es_is_written(es)) { pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " - "an written extent [%d/%d/%llu/%llx]\n", + "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); @@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_cache_extent(inode, &newes); if (!len) @@ -812,13 +810,13 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || - ext4_es_is_unwritten(&orig_es)) { + ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + orig_es.es_len - len2; - ext4_es_store_pblock(&newes, block); - } - ext4_es_store_status(&newes, ext4_es_status(&orig_es)); + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); err = __es_insert_extent(inode, &newes); if (err) { es->es_lblk = orig_es.es_lblk; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 167f4ab8ecc3..f1b62a419920 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es, (es->es_pblk & ~ES_MASK)); } +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (pb & ~ES_MASK)); +} + extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1a5073959f32..063fc1538355 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; if ((pos & blockmask) || (final_size & blockmask)) @@ -146,14 +146,14 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, overwrite = 1; } - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (ret > 0) { ssize_t err; err = generic_write_sync(file, iocb->ki_pos - ret, ret); - if (err < 0 && ret > 0) + if (err < 0) ret = err; } blk_finish_plug(&plug); @@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 24bfd7ff3049..d7b7462a0e13 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -215,7 +215,7 @@ void ext4_evict_inode(struct inode *inode) jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; @@ -226,7 +226,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) @@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; + int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -515,6 +516,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); @@ -553,7 +564,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_KEEP_SIZE); } if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -580,7 +590,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -597,7 +607,13 @@ found: * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* * Here we clear m_flags because after allocating an new extent, @@ -653,7 +669,6 @@ found: ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -688,7 +703,7 @@ found: has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -2232,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; - - ext4_wb_update_i_disksize(inode, disksize); + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", @@ -3313,33 +3338,13 @@ void ext4_set_aops(struct inode *inode) } /* - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' - * up to the end of the block which corresponds to `from'. - * This required during truncate. We need to physically zero the tail end - * of that block so it doesn't yield old data if the file is later grown. - */ -int ext4_block_truncate_page(handle_t *handle, - struct address_space *mapping, loff_t from) -{ - unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned length; - unsigned blocksize; - struct inode *inode = mapping->host; - - blocksize = inode->i_sb->s_blocksize; - length = blocksize - (offset & (blocksize - 1)); - - return ext4_block_zero_page_range(handle, mapping, from, length); -} - -/* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that cooresponds to 'from' */ -int ext4_block_zero_page_range(handle_t *handle, +static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -3429,6 +3434,26 @@ unlock: return err; } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); +} + int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { @@ -3502,7 +3527,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length); + trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions @@ -3516,15 +3541,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3605,10 +3621,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -3682,7 +3703,7 @@ void ext4_truncate(struct inode *inode) /* * There is a possibility that we're either freeing the inode - * or it completely new indode. In those cases we might not + * or it's a completely new inode. In those cases we might not * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) @@ -3934,8 +3955,8 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - set_mask_bits(&inode->i_flags, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4154,11 +4175,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } ret = 0; @@ -4328,8 +4351,7 @@ static int ext4_do_update_inode(handle_t *handle, goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); @@ -4374,12 +4396,15 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_block[block] = ei->i_data[block]; } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } ext4_inode_csum_set(inode, raw_inode, ei); @@ -4402,21 +4427,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4428,15 +4452,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -4446,7 +4470,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4456,7 +4485,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a2a837f00407..0f2252ec274d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb, struct ext4_inode_info *ei_bl; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { - err = -EINVAL; - goto swap_boot_out; - } + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto swap_boot_out; - } + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); - if (IS_ERR(inode_bl)) { - err = PTR_ERR(inode_bl); - goto swap_boot_out; - } + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); filemap_flush(inode->i_mapping); @@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_mark_inode_dirty(handle, inode); } } - ext4_journal_stop(handle); - ext4_double_up_write_data_sem(inode, inode_bl); journal_err_out: ext4_inode_resume_unlocked_dio(inode); ext4_inode_resume_unlocked_dio(inode_bl); - unlock_two_nondirectories(inode, inode_bl); - iput(inode_bl); - -swap_boot_out: return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 04a5c7504be9..c8238a26818c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -1808,6 +1816,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; @@ -1936,7 +1945,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1977,6 +1986,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; @@ -4006,8 +4016,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", - ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { @@ -5007,6 +5016,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 08481ee84cd5..d634e183b4d4 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug; } \ } while (0) #else -#define mb_debug(n, fmt, a...) +#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ @@ -175,8 +175,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_tail; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 773b503bd18c..58ee7dc87669 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) * ext4_ext_path structure refers to the last extent, or a negative error * value on failure. */ -static int +int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { @@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } if (!buffer_mapped(bh)) { zero_user(page, block_start, blocksize); - if (!err) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d050e043e884..1cb84f78909e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3000,6 +3000,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, return ext4_get_first_inline_block(inode, parent_de, retval); } +struct ext4_renament { + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool is_dir; + int dir_nlink_delta; + + /* entry for "dentry" */ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + int inlined; + + /* entry for ".." in inode if it's a directory */ + struct buffer_head *dir_bh; + struct ext4_dir_entry_2 *parent_de; + int dir_inlined; +}; + +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, + &retval, &ent->parent_de, + &ent->dir_inlined); + if (!ent->dir_bh) + return retval; + if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) + return -EIO; + BUFFER_TRACE(ent->dir_bh, "get_write_access"); + return ext4_journal_get_write_access(handle, ent->dir_bh); +} + +static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, + unsigned dir_ino) +{ + int retval; + + ent->parent_de->inode = cpu_to_le32(dir_ino); + BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); + if (!ent->dir_inlined) { + if (is_dx(ent->inode)) { + retval = ext4_handle_dirty_dx_node(handle, + ent->inode, + ent->dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + ent->inode, + ent->dir_bh); + } + } else { + retval = ext4_mark_inode_dirty(handle, ent->inode); + } + if (retval) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + return 0; +} + +static int ext4_setent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + int retval; + + BUFFER_TRACE(ent->bh, "get write access"); + retval = ext4_journal_get_write_access(handle, ent->bh); + if (retval) + return retval; + ent->de->inode = cpu_to_le32(ino); + if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + ent->de->file_type = file_type; + ent->dir->i_version++; + ent->dir->i_ctime = ent->dir->i_mtime = + ext4_current_time(ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); + if (!ent->inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + ent->dir, ent->bh); + if (unlikely(retval)) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + } + brelse(ent->bh); + ent->bh = NULL; + + return 0; +} + +static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name) +{ + int retval = -ENOENT; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + + bh = ext4_find_entry(dir, d_name, &de, NULL); + if (bh) { + retval = ext4_delete_entry(handle, dir, de, bh); + brelse(bh); + } + return retval; +} + +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + /* + * ent->de could have moved from under us during htree split, so make + * sure that we are deleting the right entry. We might also be pointing + * to a stale entry in the unused part of ent->bh so just checking inum + * and the name isn't enough. + */ + if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || + ent->de->name_len != ent->dentry->d_name.len || + strncmp(ent->de->name, ent->dentry->d_name.name, + ent->de->name_len)) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } else { + retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); + if (retval == -ENOENT) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } + } + + if (retval) { + ext4_warning(ent->dir->i_sb, + "Deleting old file (%lu), %d, error=%d", + ent->dir->i_ino, ent->dir->i_nlink, retval); + } +} + +static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) +{ + if (ent->dir_nlink_delta) { + if (ent->dir_nlink_delta == -1) + ext4_dec_count(handle, ent->dir); + else + ext4_inc_count(handle, ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + } +} + /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. @@ -3012,198 +3160,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle = NULL; - struct inode *old_inode, *new_inode; - struct buffer_head *old_bh, *new_bh, *dir_bh; - struct ext4_dir_entry_2 *old_de, *new_de; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; int retval; - int inlined = 0, new_inlined = 0; - struct ext4_dir_entry_2 *parent_de; - dquot_initialize(old_dir); - dquot_initialize(new_dir); - - old_bh = new_bh = dir_bh = NULL; + dquot_initialize(old.dir); + dquot_initialize(new.dir); /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new_dentry->d_inode) - dquot_initialize(new_dentry->d_inode); + if (new.inode) + dquot_initialize(new.inode); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ - old_inode = old_dentry->d_inode; retval = -ENOENT; - if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; - new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, - &new_de, &new_inlined); - if (new_bh) { - if (!new_inode) { - brelse(new_bh); - new_bh = NULL; + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (new.bh) { + if (!new.inode) { + brelse(new.bh); + new.bh = NULL; } } - if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - ext4_alloc_da_blocks(old_inode); + if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old.inode); - handle = ext4_journal_start(old_dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) return PTR_ERR(handle); - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); - if (S_ISDIR(old_inode->i_mode)) { - if (new_inode) { + if (S_ISDIR(old.inode->i_mode)) { + if (new.inode) { retval = -ENOTEMPTY; - if (!empty_dir(new_inode)) + if (!empty_dir(new.inode)) + goto end_rename; + } else { + retval = -EMLINK; + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = -EIO; - dir_bh = ext4_get_first_dir_block(handle, old_inode, - &retval, &parent_de, - &inlined); - if (!dir_bh) - goto end_rename; - if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; - if (!new_inode && new_dir != old_dir && - EXT4_DIR_LINK_MAX(new_dir)) - goto end_rename; - BUFFER_TRACE(dir_bh, "get_write_access"); - retval = ext4_journal_get_write_access(handle, dir_bh); + retval = ext4_rename_dir_prepare(handle, &old); if (retval) goto end_rename; } - if (!new_bh) { - retval = ext4_add_entry(handle, new_dentry, old_inode); + if (!new.bh) { + retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { - BUFFER_TRACE(new_bh, "get write access"); - retval = ext4_journal_get_write_access(handle, new_bh); + retval = ext4_setent(handle, &new, + old.inode->i_ino, old.de->file_type); if (retval) goto end_rename; - new_de->inode = cpu_to_le32(old_inode->i_ino); - if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, - EXT4_FEATURE_INCOMPAT_FILETYPE)) - new_de->file_type = old_de->file_type; - new_dir->i_version++; - new_dir->i_ctime = new_dir->i_mtime = - ext4_current_time(new_dir); - ext4_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - if (!new_inlined) { - retval = ext4_handle_dirty_dirent_node(handle, - new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; - } - } - brelse(new_bh); - new_bh = NULL; } /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = ext4_current_time(old_inode); - ext4_mark_inode_dirty(handle, old_inode); + old.inode->i_ctime = ext4_current_time(old.inode); + ext4_mark_inode_dirty(handle, old.inode); /* * ok, that's it */ - if (le32_to_cpu(old_de->inode) != old_inode->i_ino || - old_de->name_len != old_dentry->d_name.len || - strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || - (retval = ext4_delete_entry(handle, old_dir, - old_de, old_bh)) == -ENOENT) { - /* old_de could have moved from under us during htree split, so - * make sure that we are deleting the right entry. We might - * also be pointing to a stale entry in the unused part of - * old_bh so just checking inum and the name isn't enough. */ - struct buffer_head *old_bh2; - struct ext4_dir_entry_2 *old_de2; - - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, - &old_de2, NULL); - if (old_bh2) { - retval = ext4_delete_entry(handle, old_dir, - old_de2, old_bh2); - brelse(old_bh2); - } + ext4_rename_delete(handle, &old); + + if (new.inode) { + ext4_dec_count(handle, new.inode); + new.inode->i_ctime = ext4_current_time(new.inode); } - if (retval) { - ext4_warning(old_dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - old_dir->i_ino, old_dir->i_nlink, retval); - } - - if (new_inode) { - ext4_dec_count(handle, new_inode); - new_inode->i_ctime = ext4_current_time(new_inode); - } - old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); - ext4_update_dx_flag(old_dir); - if (dir_bh) { - parent_de->inode = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - if (!inlined) { - if (is_dx(old_inode)) { - retval = ext4_handle_dirty_dx_node(handle, - old_inode, - dir_bh); - } else { - retval = ext4_handle_dirty_dirent_node(handle, - old_inode, dir_bh); - } - } else { - retval = ext4_mark_inode_dirty(handle, old_inode); - } - if (retval) { - ext4_std_error(old_dir->i_sb, retval); + old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + ext4_update_dx_flag(old.dir); + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) goto end_rename; - } - ext4_dec_count(handle, old_dir); - if (new_inode) { + + ext4_dec_count(handle, old.dir); + if (new.inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - clear_nlink(new_inode); + clear_nlink(new.inode); } else { - ext4_inc_count(handle, new_dir); - ext4_update_dx_flag(new_dir); - ext4_mark_inode_dirty(handle, new_dir); + ext4_inc_count(handle, new.dir); + ext4_update_dx_flag(new.dir); + ext4_mark_inode_dirty(handle, new.dir); } } - ext4_mark_inode_dirty(handle, old_dir); - if (new_inode) { - ext4_mark_inode_dirty(handle, new_inode); - if (!new_inode->i_nlink) - ext4_orphan_add(handle, new_inode); + ext4_mark_inode_dirty(handle, old.dir); + if (new.inode) { + ext4_mark_inode_dirty(handle, new.inode); + if (!new.inode->i_nlink) + ext4_orphan_add(handle, new.inode); } retval = 0; end_rename: - brelse(dir_bh); - brelse(old_bh); - brelse(new_bh); + brelse(old.dir_bh); + brelse(old.bh); + brelse(new.bh); if (handle) ext4_journal_stop(handle); return retval; } +static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; + u8 new_file_type; + int retval; + + dquot_initialize(old.dir); + dquot_initialize(new.dir); + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, + &old.de, &old.inlined); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto end_rename; + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + + /* RENAME_EXCHANGE case: old *and* new must both exist */ + if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) + goto end_rename; + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + old.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &old); + if (retval) + goto end_rename; + } + if (S_ISDIR(new.inode->i_mode)) { + new.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &new); + if (retval) + goto end_rename; + } + + /* + * Other than the special case of overwriting a directory, parents' + * nlink only needs to be modified if this is a cross directory rename. + */ + if (old.dir != new.dir && old.is_dir != new.is_dir) { + old.dir_nlink_delta = old.is_dir ? -1 : 1; + new.dir_nlink_delta = -old.dir_nlink_delta; + retval = -EMLINK; + if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || + (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) + goto end_rename; + } + + new_file_type = new.de->file_type; + retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); + if (retval) + goto end_rename; + + retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); + if (retval) + goto end_rename; + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old.inode->i_ctime = ext4_current_time(old.inode); + new.inode->i_ctime = ext4_current_time(new.inode); + ext4_mark_inode_dirty(handle, old.inode); + ext4_mark_inode_dirty(handle, new.inode); + + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + } + if (new.dir_bh) { + retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); + if (retval) + goto end_rename; + } + ext4_update_dir_count(handle, &old); + ext4_update_dir_count(handle, &new); + retval = 0; + +end_rename: + brelse(old.dir_bh); + brelse(new.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return ext4_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" + * is equivalent to regular rename. + */ + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); +} + /* * directories can handle most operations... */ @@ -3218,6 +3435,7 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, .rename = ext4_rename, + .rename2 = ext4_rename2, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ab95508e3d40..c18d95b50540 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 710fed2377d4..6f9e6fadac04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,6 +59,7 @@ static struct kset *ext4_kset; static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; @@ -940,7 +945,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), @@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "feature flags set on rev 0 fs, " "running e2fsck is recommended"); + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + goto failed_mount; + } + } + if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " @@ -3854,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + + /* + * set up enough so that it can read an inode, + * and create new inode for buddy allocator + */ + sbi->s_gdb_count = db_count; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount2; + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2; + goto failed_mount2a; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2; + goto failed_mount2a; } - sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3901,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; - /* - * set up enough so that it can read an inode - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4010,6 +4036,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); no_journal: + if (ext4_mballoc_ready) { + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; + } + } + /* * Get the # of file system overhead blocks from the * superblock if present. @@ -4090,21 +4124,13 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; + goto failed_mount5; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4a; - } - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); goto failed_mount5; } @@ -4181,11 +4207,8 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_mb_release(sb); -failed_mount5: - ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4a: +failed_mount5: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4209,11 +4232,14 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); +failed_mount2a: + ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { @@ -4835,6 +4861,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (*flags & MS_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; @@ -5516,11 +5545,9 @@ static int __init ext4_init_fs(void) err = ext4_init_mballoc(); if (err) - goto out3; - - err = ext4_init_xattr(); - if (err) goto out2; + else + ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5536,10 +5563,9 @@ out: unregister_as_ext3(); destroy_inodecache(); out1: - ext4_exit_xattr(); -out2: + ext4_mballoc_ready = 0; ext4_exit_mballoc(); -out3: +out2: ext4_exit_feat_adverts(); out4: if (ext4_proc_root) @@ -5562,7 +5588,6 @@ static void __exit ext4_exit_fs(void) unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - ext4_exit_xattr(); ext4_exit_mballoc(); ext4_exit_feat_adverts(); remove_proc_entry("fs/ext4", NULL); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e175e94116ac..4eec399ec807 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -81,7 +81,7 @@ # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); @@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *, static int ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size); -static struct mb_cache *ext4_xattr_cache; - static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; +#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_mb_cache) + static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) @@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -286,7 +288,7 @@ bad_block: error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); if (error == -EIO) @@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: @@ -517,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, { struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; @@ -538,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); @@ -567,12 +586,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } @@ -745,13 +765,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &bs->s; struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, bs->bh->b_blocknr); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -769,7 +790,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); - ext4_xattr_cache_insert(bs->bh); + ext4_xattr_cache_insert(ext4_mb_cache, + bs->bh); } unlock_buffer(bs->bh); if (error == -EIO) @@ -905,7 +927,7 @@ getblk_failed: memcpy(new_bh->b_data, s->base, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(new_bh); + ext4_xattr_cache_insert(ext4_mb_cache, new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, new_bh); if (error) @@ -1228,7 +1250,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t min_offs, free; - int total_ino, total_blk; + int total_ino; void *base, *start, *end; int extra_isize = 0, error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1286,8 +1308,7 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); + free = ext4_xattr_free_space(first, &min_offs, base, NULL); if (free < new_extra_isize) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; @@ -1495,13 +1516,13 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. */ static void -ext4_xattr_cache_insert(struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); if (!ce) { ea_bdebug(bh, "out of memory"); return; @@ -1573,12 +1594,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, hash); while (ce) { struct buffer_head *bh; @@ -1676,19 +1698,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -ext4_init_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(char *name) { - ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); - if (!ext4_xattr_cache) - return -ENOMEM; - return 0; + return mb_cache_create(name, HASH_BUCKET_BITS); } -void -ext4_exit_xattr(void) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { - if (ext4_xattr_cache) - mb_cache_destroy(ext4_xattr_cache); - ext4_xattr_cache = NULL; + if (cache) + mb_cache_destroy(cache); } + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 819d6398833f..29bedf5589f6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int __init ext4_init_xattr(void); -extern void ext4_exit_xattr(void); - extern const struct xattr_handler *ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, @@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fa8da4cb8c4b..e93e4ec7d165 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, retval); @@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; + if (acl) { + error = posix_acl_valid(acl); + if (error < 0) + return error; + } + switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 293d0486a40f..4aa521aa9bc3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) { cond_resched(); goto repeat; } - /* We wait writeback only inside grab_meta_page() */ - wait_on_page_writeback(page); SetPageUptodate(page); return page; } @@ -75,23 +73,102 @@ out: return page; } +inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +{ + switch (type) { + case META_NAT: + return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; + case META_SIT: + return SIT_BLK_CNT(sbi); + case META_SSA: + case META_CP: + return 0; + default: + BUG(); + } +} + +/* + * Readahead CP/NAT/SIT/SSA pages + */ +int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +{ + block_t prev_blk_addr = 0; + struct page *page; + int blkno = start; + int max_blks = get_max_meta_blks(sbi, type); + + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; nrpages-- > 0; blkno++) { + block_t blk_addr; + + switch (type) { + case META_NAT: + /* get nat block addr */ + if (unlikely(blkno >= max_blks)) + blkno = 0; + blk_addr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + /* get sit block addr */ + if (unlikely(blkno >= max_blks)) + goto out; + blk_addr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + if (blkno != start && prev_blk_addr + 1 != blk_addr) + goto out; + prev_blk_addr = blk_addr; + break; + case META_SSA: + case META_CP: + /* get ssa/cp block addr */ + blk_addr = blkno; + break; + default: + BUG(); + } + + page = grab_cache_page(META_MAPPING(sbi), blk_addr); + if (!page) + continue; + if (PageUptodate(page)) { + mark_page_accessed(page); + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + mark_page_accessed(page); + f2fs_put_page(page, 0); + } +out: + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - /* Should not write any meta pages, if any IO error was occurred */ - if (unlikely(sbi->por_doing || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + if (unlikely(sbi->por_doing)) goto redirty_out; - if (wbc->for_reclaim) goto redirty_out; - wait_on_page_writeback(page); + /* Should not write any meta pages, if any IO error was occurred */ + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto no_write; + f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); +no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; @@ -99,6 +176,7 @@ static int f2fs_write_meta_page(struct page *page, redirty_out: dec_page_count(sbi, F2FS_DIRTY_META); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } @@ -107,21 +185,23 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - long written; - - if (wbc->for_kupdate) - return 0; + long diff, written; /* collect a number of dirty meta pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) - return 0; + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, nrpages); + diff = nr_pages_to_write(sbi, META, wbc); + written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); - wbc->nr_to_write -= written; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); return 0; } @@ -148,10 +228,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + lock_page(page); - f2fs_bug_on(page->mapping != mapping); - f2fs_bug_on(!PageDirty(page)); - clear_page_dirty_for_io(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + if (f2fs_write_meta_page(page, &wbc)) { unlock_page(page); break; @@ -216,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head, *this; - struct orphan_inode_entry *new = NULL, *orphan = NULL; + struct list_head *head; + struct orphan_inode_entry *new, *orphan; new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); new->ino = ino; spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; - list_for_each(this, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { spin_unlock(&sbi->orphan_inode_lock); kmem_cache_free(orphan_entry_slab, new); @@ -234,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (orphan->ino > ino) break; - orphan = NULL; } - /* add new_oentry into list which is sorted by inode number */ - if (orphan) - list_add(&new->list, this->prev); - else - list_add_tail(&new->list, head); + /* add new orphan entry into list which is sorted by inode number */ + list_add_tail(&new->list, &orphan->list); spin_unlock(&sbi->orphan_inode_lock); } @@ -255,10 +342,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); - kmem_cache_free(orphan_entry_slab, orphan); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - break; + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, orphan); + return; } } spin_unlock(&sbi->orphan_inode_lock); @@ -285,6 +373,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1; orphan_blkaddr = __start_sum_addr(sbi) - 1; + ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + for (i = 0; i < orphan_blkaddr; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; @@ -466,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + struct dir_inode_entry *entry; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) if (unlikely(entry->inode == inode)) return -EEXIST; - } + list_add_tail(&new->list, head); stat_inc_dirty_dir(sbi); return 0; @@ -483,6 +571,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new; + int ret = 0; if (!S_ISDIR(inode->i_mode)) return; @@ -492,13 +581,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page) INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); - - inc_page_count(sbi, F2FS_DIRTY_DENTS); + ret = __add_dirty_inode(inode, new); inode_inc_dirty_dents(inode); SetPagePrivate(page); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void add_dirty_dir_inode(struct inode *inode) @@ -506,44 +595,47 @@ void add_dirty_dir_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct dir_inode_entry *new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + int ret = 0; new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - if (__add_dirty_inode(inode, new)) - kmem_cache_free(inode_entry_slab, new); + ret = __add_dirty_inode(inode, new); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - - struct list_head *this, *head; + struct list_head *head; + struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) { + if (get_dirty_dents(inode)) { spin_unlock(&sbi->dir_inode_lock); return; } head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode == inode) { list_del(&entry->list); - kmem_cache_free(inode_entry_slab, entry); stat_dec_dirty_dir(sbi); - break; + spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); + goto done; } } spin_unlock(&sbi->dir_inode_lock); +done: /* Only from the recovery routine */ if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); @@ -554,15 +646,14 @@ void remove_dirty_dir_inode(struct inode *inode) struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *head; + struct list_head *head; struct inode *inode = NULL; + struct dir_inode_entry *entry; spin_lock(&sbi->dir_inode_lock); head = &sbi->dir_inode_list; - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); + list_for_each_entry(entry, head, list) { if (entry->inode->i_ino == ino) { inode = entry->inode; break; @@ -589,7 +680,7 @@ retry: inode = igrab(entry->inode); spin_unlock(&sbi->dir_inode_lock); if (inode) { - filemap_flush(inode->i_mapping); + filemap_fdatawrite(inode->i_mapping); iput(inode); } else { /* @@ -824,6 +915,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + stat_inc_cp_count(sbi->stat_info); trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } @@ -845,11 +937,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi) int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry), NULL); + sizeof(struct orphan_inode_entry)); if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry), NULL); + sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2261ccdd0b5f..45abd60e2bff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err) static void f2fs_write_end_io(struct bio *bio, int err) { - struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; int i; @@ -55,15 +55,16 @@ static void f2fs_write_end_io(struct bio *bio, int err) if (unlikely(err)) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + f2fs_stop_checkpoint(sbi); } end_page_writeback(page); dec_page_count(sbi, F2FS_WRITEBACK); } - if (bio->bi_private) - complete(bio->bi_private); + if (sbi->wait_io) { + complete(sbi->wait_io); + sbi->wait_io = NULL; + } if (!get_pages(sbi, F2FS_WRITEBACK) && !list_empty(&sbi->cp_wait.task_list)) @@ -86,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + bio->bi_private = sbi; return bio; } @@ -113,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) */ if (fio->type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); - io->bio->bi_private = &wait; + io->sbi->wait_io = &wait; submit_bio(rw, io->bio); wait_for_completion(&wait); } else { @@ -132,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { @@ -140,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); } /* @@ -178,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, verify_block_addr(sbi, blk_addr); - mutex_lock(&io->io_mutex); + down_write(&io->io_rwsem); if (!is_read) inc_page_count(sbi, F2FS_WRITEBACK); @@ -202,7 +204,7 @@ alloc_new: io->last_block_in_bio = blk_addr; - mutex_unlock(&io->io_mutex); + up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); } @@ -797,48 +799,36 @@ static int f2fs_write_data_page(struct page *page, */ offset = i_size & (PAGE_CACHE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) { - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } + inode_dec_dirty_dents(inode); goto out; } zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (unlikely(sbi->por_doing)) { - err = AOP_WRITEPAGE_ACTIVATE; + if (unlikely(sbi->por_doing)) goto redirty_out; - } /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); err = do_write_data_page(page, &fio); - } else { - f2fs_lock_op(sbi); - - if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { - err = f2fs_write_inline_data(inode, page, offset); - f2fs_unlock_op(sbi); - goto out; - } else { - err = do_write_data_page(page, &fio); - } + goto done; + } - f2fs_unlock_op(sbi); + if (!wbc->for_reclaim) need_balance_fs = true; - } - if (err == -ENOENT) - goto out; - else if (err) + else if (has_not_enough_free_secs(sbi, 0)) goto redirty_out; - if (wbc->for_reclaim) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - need_balance_fs = false; - } + f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) + err = f2fs_write_inline_data(inode, page, offset); + else + err = do_write_data_page(page, &fio); + f2fs_unlock_op(sbi); +done: + if (err && err != -ENOENT) + goto redirty_out; clear_cold_data(page); out: @@ -849,12 +839,11 @@ out: redirty_out: wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); - return err; + return AOP_WRITEPAGE_ACTIVATE; } -#define MAX_DESIRED_PAGES_WP 4096 - static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -871,17 +860,17 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); bool locked = false; int ret; - long excess_nrtw = 0, desired_nrtw; + long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { - desired_nrtw = MAX_DESIRED_PAGES_WP; - excess_nrtw = desired_nrtw - wbc->nr_to_write; - wbc->nr_to_write = desired_nrtw; - } + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA)) + goto skip_write; + + diff = nr_pages_to_write(sbi, DATA, wbc); if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); @@ -895,8 +884,12 @@ static int f2fs_write_data_pages(struct address_space *mapping, remove_dirty_dir_inode(inode); - wbc->nr_to_write -= excess_nrtw; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; + +skip_write: + wbc->pages_skipped += get_dirty_dents(inode); + return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, @@ -949,13 +942,19 @@ inline_data: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_read_inline_data(inode, page); - else + if (err) { + page_cache_release(page); + return err; + } + } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) - return err; + if (err) + return err; + } + lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); @@ -1031,11 +1030,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (PageDirty(page)) inode_dec_dirty_dents(inode); - } ClearPagePrivate(page); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 3de9d20d0c14..b52c12cf5873 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; - struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; int ndirty = 0; @@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - mutex_lock(&sit_i->sentry_lock); for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - hblks_per_sec); @@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - mutex_unlock(&sit_i->sentry_lock); dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) @@ -236,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d\n", si->data_segs); @@ -252,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs: %5d > %lu\n", - si->nats, NM_WOUT_THRESHOLD); - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", - si->sits, si->fnids); + seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", + si->nats, si->sits); + seq_printf(s, " - free_nids: %9d\n", + si->fnids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2b7c255bcbdf..972fd0ef230f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode) >> PAGE_CACHE_SHIFT; } -static unsigned int dir_buckets(unsigned int level) +static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level < MAX_DIR_HASH_DEPTH / 2) - return 1 << level; + return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); + return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1); } static unsigned int bucket_blocks(unsigned int level) @@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -static unsigned long dir_block_index(unsigned int level, unsigned int idx) +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) - bidx += dir_buckets(i) * bucket_blocks(i); + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); bidx += idx * bucket_blocks(level); return bidx; } @@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; - unsigned long bit_pos, end_pos, next_pos; + unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - int slots; + const void *dentry_bits = &dentry_blk->dentry_bitmap; + int max_len = 0; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, 0); while (bit_pos < NR_DENTRY_IN_BLOCK) { + if (!test_bit_le(bit_pos, dentry_bits)) { + if (bit_pos == 0) + max_len = 1; + else if (!test_bit_le(bit_pos - 1, dentry_bits)) + max_len++; + bit_pos++; + continue; + } de = &dentry_blk->dentry[bit_pos]; - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { @@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, goto found; } } - next_pos = bit_pos + slots; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, next_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - end_pos = NR_DENTRY_IN_BLOCK; - else - end_pos = bit_pos; - if (*max_slots < end_pos - next_pos) - *max_slots = end_pos - next_pos; + if (max_len > *max_slots) { + *max_slots = max_len; + max_len = 0; + } + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; kunmap(dentry_page); found: + if (max_len > *max_slots) + *max_slots = max_len; return de; } @@ -141,10 +145,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { @@ -248,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); @@ -347,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode, err = f2fs_init_security(inode, dir, name, page); if (err) goto put_error; - - wait_on_page_writeback(page); } else { page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(page)) return page; - wait_on_page_writeback(page); set_cold_node(inode, page); } @@ -372,6 +374,10 @@ static struct page *init_inode_metadata(struct inode *inode, put_error: f2fs_put_page(page, 1); + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + truncate_inode_pages(&inode->i_data, 0); + truncate_blocks(inode, 0); + remove_dirty_dir_inode(inode); error: remove_inode_page(inode); return ERR_PTR(err); @@ -395,9 +401,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) - update_inode_page(dir); - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -464,10 +467,11 @@ start: if (level == current_depth) ++current_depth; - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); @@ -487,8 +491,9 @@ start: ++level; goto start; add_dentry: - wait_on_page_writeback(dentry_page); + f2fs_wait_on_page_writeback(dentry_page, DATA); + down_write(&F2FS_I(inode)->i_sem); page = init_inode_metadata(inode, dir, name); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -511,7 +516,12 @@ add_dentry: update_parent_metadata(dir, inode, current_depth); fail: - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + up_write(&F2FS_I(inode)->i_sem); + + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode_page(dir); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -528,13 +538,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, unsigned int bit_pos; struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); dentry_blk = (struct f2fs_dentry_block *)kaddr; bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; @@ -551,6 +560,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + + down_write(&F2FS_I(inode)->i_sem); + if (S_ISDIR(inode->i_mode)) { drop_nlink(dir); update_inode_page(dir); @@ -561,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, drop_nlink(inode); i_size_write(inode, 0); } + up_write(&F2FS_I(inode)->i_sem); update_inode_page(inode); if (inode->i_nlink == 0) @@ -573,7 +587,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc3c558cb4f3..2ecac8312359 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -40,6 +40,7 @@ #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -88,6 +89,16 @@ enum { SIT_BITMAP }; +/* + * For CP/NAT/SIT/SSA readahead + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA +}; + /* for the list of orphan inodes */ struct orphan_inode_entry { struct list_head list; /* list head */ @@ -187,16 +198,20 @@ struct extent_info { #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define DEF_DIR_LEVEL 0 + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ unsigned int i_current_depth; /* use only in directory structure */ unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ + struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -229,6 +244,7 @@ struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -238,6 +254,7 @@ struct f2fs_nm_info { struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* a list for free nids */ spinlock_t free_nid_list_lock; /* protect free nid list */ unsigned int fcnt; /* the number of free node id */ @@ -300,6 +317,12 @@ enum { NO_CHECK_TYPE }; +struct flush_cmd { + struct flush_cmd *next; + struct completion wait; + int ret; +}; + struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ @@ -328,6 +351,14 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ + + /* for flush command control */ + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct flush_cmd *issue_list; /* list for command issue */ + struct flush_cmd *dispatch_list; /* list for command dispatch */ + spinlock_t issue_lock; /* for issue list lock */ + struct flush_cmd *issue_tail; /* list tail of issue list */ }; /* @@ -378,7 +409,7 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ - struct mutex io_mutex; /* mutex for bio */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ }; struct f2fs_sb_info { @@ -398,6 +429,7 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -407,7 +439,6 @@ struct f2fs_sb_info { struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ - bool on_build_free_nids; /* build_free_nids is doing */ wait_queue_head_t cp_wait; /* for orphan inode management */ @@ -436,6 +467,7 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ int active_logs; /* # of active logs */ + int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -622,6 +654,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode) return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; } +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; +} + static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { @@ -661,6 +698,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_dents(struct inode *inode) { + inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_inc(&F2FS_I(inode)->dirty_dents); } @@ -671,6 +709,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_dents(struct inode *inode) { + if (!S_ISDIR(inode->i_mode)) + return; + + dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_dec(&F2FS_I(inode)->dirty_dents); } @@ -679,6 +721,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_dirty_dents(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_dents); +} + static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * @@ -689,11 +736,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { - block_t ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_block_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_block_count; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) @@ -789,11 +832,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_node_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) @@ -814,11 +853,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_inode_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_inode_count; } static inline void f2fs_put_page(struct page *page, int unlock) @@ -844,9 +879,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn) } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, - size_t size, void (*ctor)(void *)) + size_t size) { - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, @@ -983,24 +1018,28 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, ri->i_inline |= F2FS_INLINE_DATA; } +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); +} + static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) { - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(&fi->vfs_inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } static inline void *inline_xattr_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } static inline int inline_xattr_size(struct inode *inode) { - if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR)) + if (f2fs_has_inline_xattr(inode)) return F2FS_INLINE_XATTR_ADDRS << 2; else return 0; @@ -1013,8 +1052,7 @@ static inline int f2fs_has_inline_data(struct inode *inode) static inline void *inline_data_addr(struct page *page) { - struct f2fs_inode *ri; - ri = (struct f2fs_inode *)page_address(page); + struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[1]); } @@ -1023,6 +1061,12 @@ static inline int f2fs_readonly(struct super_block *sb) return sb->s_flags & MS_RDONLY; } +static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1048,7 +1092,7 @@ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); void update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); +void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); @@ -1097,6 +1141,7 @@ struct dnode_of_data; struct node_info; int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool fsync_mark_done(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); @@ -1115,6 +1160,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); @@ -1129,7 +1175,9 @@ void destroy_node_manager_caches(void); */ void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); +int f2fs_issue_flush(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); @@ -1162,6 +1210,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); @@ -1231,7 +1280,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count; + int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int tot_blks, data_blks, node_blks; int curseg[NR_CURSEG_TYPE]; @@ -1248,6 +1297,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) return (struct f2fs_stat_info *)sbi->stat_info; } +#define stat_inc_cp_count(si) ((si)->cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) @@ -1302,6 +1352,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else +#define stat_inc_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) #define stat_inc_dirty_dir(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0dfcef53a6ed..60e7d5448a1d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -84,6 +84,7 @@ out: static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -111,11 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; + struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int ret = 0; bool need_cp = false; struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, + .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; @@ -133,7 +135,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - mutex_lock(&inode->i_mutex); + down_read(&fi->i_sem); /* * Both of fdatasync() and fsync() are able to be recovered from @@ -150,25 +152,33 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) need_cp = true; + up_read(&fi->i_sem); + if (need_cp) { nid_t pino; - F2FS_I(inode)->xattr_ver = 0; - /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); + + down_write(&fi->i_sem); + F2FS_I(inode)->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { F2FS_I(inode)->i_pino = pino; file_got_pino(inode); + up_write(&fi->i_sem); mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; + } else { + up_write(&fi->i_sem); } } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + if (fsync_mark_done(sbi, inode->i_ino)) + goto out; mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) @@ -177,10 +187,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_on_node_pages_writeback(sbi, inode->i_ino); if (ret) goto out; - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: - mutex_unlock(&inode->i_mutex); trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } @@ -245,7 +254,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_put_page(page, 1); return; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); f2fs_put_page(page, 1); @@ -422,7 +431,7 @@ static void fill_zero(struct inode *inode, pgoff_t index, f2fs_unlock_op(sbi); if (!IS_ERR(page)) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -560,6 +569,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) ret = punch_hole(inode, offset, len); else @@ -569,6 +580,9 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + + mutex_unlock(&inode->i_mutex); + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea0371e854b4..b90dbe55403a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) set_page_dirty(page); set_cold_data(page); } else { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page) && - S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (clear_page_dirty_for_io(page)) inode_dec_dirty_dents(inode); - } set_cold_data(page); do_write_data_page(page, &fio); clear_cold_data(page); @@ -701,6 +696,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi) gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; @@ -711,6 +708,11 @@ gc_more: goto stop; ret = 0; + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); @@ -740,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry), NULL); + sizeof(struct inode_entry)); if (!winode_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 31ee5b164ff9..383db1fabcf4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + unlock_page(page); return PTR_ERR(ipage); + } zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4d67ed736dca..ee829d360468 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode) fi->flags = 0; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; get_extent_info(&fi->ext, ri->i_ext); get_inline_info(fi, ri); @@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -212,24 +214,29 @@ void update_inode(struct inode *inode, struct page *node_page) clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - +retry: node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); - + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi); + } + return; + } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return 0; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - int ret; if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -243,13 +250,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * during the urgent cleaning time when runing out of free sections. */ f2fs_lock_op(sbi); - ret = update_inode_page(inode); + update_inode_page(inode); f2fs_unlock_op(sbi); if (wbc) f2fs_balance_fs(sbi); - return ret; + return 0; } /* @@ -260,13 +267,13 @@ void f2fs_evict_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); trace_f2fs_evict_inode(inode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; - f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents)); + f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 397d459e97bf..a9409d19dfd4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + + stat_inc_inline_inode(inode); } return d_splice_alias(inode, dentry); @@ -424,12 +426,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); new_inode->i_ctime = CURRENT_TIME; + down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) @@ -459,7 +466,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + down_write(&F2FS_I(old_inode)->i_sem); F2FS_I(old_inode)->i_pino = new_dir->i_ino; + up_write(&F2FS_I(old_inode)->i_sem); update_inode_page(old_inode); } else { kunmap(old_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b0649b76eb4f..a161e955c4c8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -21,9 +21,27 @@ #include "segment.h" #include <trace/events/f2fs.h> +#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) + static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type) +{ + struct sysinfo val; + unsigned long mem_size = 0; + + si_meminfo(&val); + if (type == FREE_NIDS) + mem_size = nm_i->fcnt * sizeof(struct free_nid); + else if (type == NAT_ENTRIES) + mem_size += nm_i->nat_cnt * sizeof(struct nat_entry); + mem_size >>= 12; + + /* give 50:50 memory for free nids and nat caches respectively */ + return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11)); +} + static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -82,42 +100,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -/* - * Readahead NAT pages - */ -static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - pgoff_t index; - int i; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (unlikely(nid >= nm_i->max_nid)) - nid = 0; - index = current_nat_addr(sbi, nid); - - page = grab_cache_page(mapping, index); - if (!page) - continue; - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - f2fs_submit_page_mbio(sbi, page, index, &fio); - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - f2fs_submit_merged_bio(sbi, META, READ); -} - static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -151,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } +bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool fsync_done = false; + + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + fsync_done = e->fsync_done; + read_unlock(&nm_i->nat_tree_lock); + return fsync_done; +} + static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; @@ -164,6 +160,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); + new->checkpointed = true; list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -185,13 +182,12 @@ retry: nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); nat_set_ino(e, le32_to_cpu(ne->ino)); nat_set_version(e, ne->version); - e->checkpointed = true; } write_unlock(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, - block_t new_blkaddr) + block_t new_blkaddr, bool fsync_done) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -205,7 +201,6 @@ retry: goto retry; } e->ni = *ni; - e->checkpointed = true; f2fs_bug_on(ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* @@ -217,9 +212,6 @@ retry: f2fs_bug_on(ni->blk_addr != NULL_ADDR); } - if (new_blkaddr == NEW_ADDR) - e->checkpointed = false; - /* sanity check */ f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && @@ -239,6 +231,11 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) + e->fsync_done = fsync_done; write_unlock(&nm_i->nat_tree_lock); } @@ -246,7 +243,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) + if (available_free_memory(nm_i, NAT_ENTRIES)) return 0; write_lock(&nm_i->nat_tree_lock); @@ -505,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn) /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode); - set_node_addr(sbi, &ni, NULL_ADDR); + set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); @@ -763,7 +760,7 @@ skip_partial: f2fs_put_page(page, 1); goto restart; } - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -852,7 +849,8 @@ struct page *new_node_page(struct dnode_of_data *dn, if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + dn->nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -867,14 +865,14 @@ struct page *new_node_page(struct dnode_of_data *dn, f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); set_page_dirty(page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; @@ -948,7 +946,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) struct page *page; int err; repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = grab_cache_page_write_begin(NODE_MAPPING(sbi), + nid, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -959,7 +958,7 @@ repeat: goto got_it; lock_page(page); - if (unlikely(!PageUptodate(page))) { + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -968,7 +967,6 @@ repeat: goto repeat; } got_it: - f2fs_bug_on(nid != nid_of_node(page)); mark_page_accessed(page); return page; } @@ -1168,7 +1166,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); if (TestClearPageError(page)) ret = -EIO; } @@ -1201,7 +1199,7 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(sbi->por_doing)) goto redirty_out; - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, NODE); /* get old block addr of this node page */ nid = nid_of_node(page); @@ -1222,7 +1220,7 @@ static int f2fs_write_node_page(struct page *page, mutex_lock(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr); + set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); unlock_page(page); @@ -1231,35 +1229,32 @@ static int f2fs_write_node_page(struct page *page, redirty_out: dec_page_count(sbi, F2FS_DIRTY_NODES); wbc->pages_skipped++; + account_page_redirty(page); set_page_dirty(page); return AOP_WRITEPAGE_ACTIVATE; } -/* - * It is very important to gather dirty pages and write at once, so that we can - * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB) * 3 node types, is more reasonable. - */ -#define COLLECT_DIRTY_NODES 1536 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - long nr_to_write = wbc->nr_to_write; + long diff; /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) - return 0; + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = 3 * max_hw_blocks(sbi); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - - wbc->nr_to_write); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); return 0; } @@ -1307,22 +1302,17 @@ const struct address_space_operations f2fs_node_aops = { .releasepage = f2fs_release_node_page, }; -static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) { - struct list_head *this; - struct free_nid *i; - list_for_each(this, head) { - i = list_entry(this, struct free_nid, list); - if (i->nid == n) - return i; - } - return NULL; + return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct free_nid *i) +static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, + struct free_nid *i) { list_del(&i->list); - kmem_cache_free(free_nid_slab, i); + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) @@ -1331,7 +1321,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) struct nat_entry *ne; bool allocated = false; - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + if (!available_free_memory(nm_i, FREE_NIDS)) return -1; /* 0 nid should not be used */ @@ -1342,7 +1332,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) /* do not add allocated nids */ read_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + if (ne && + (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; read_unlock(&nm_i->nat_tree_lock); if (allocated) @@ -1354,7 +1345,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) i->state = NID_NEW; spin_lock(&nm_i->free_nid_list_lock); - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); return 0; @@ -1368,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; + bool need_free = false; + spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + need_free = true; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } static void scan_nat_page(struct f2fs_nm_info *nm_i, @@ -1413,7 +1410,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_nat_pages(sbi, nid); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1454,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; - struct list_head *this; retry: if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) return false; @@ -1462,13 +1458,11 @@ retry: spin_lock(&nm_i->free_nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !sbi->on_build_free_nids) { + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { f2fs_bug_on(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); + list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) break; - } f2fs_bug_on(i->state != NID_NEW); *nid = i->nid; @@ -1481,9 +1475,7 @@ retry: /* Let's scan nat pages and its caches to get free nids */ mutex_lock(&nm_i->build_lock); - sbi->on_build_free_nids = true; build_free_nids(sbi); - sbi->on_build_free_nids = false; mutex_unlock(&nm_i->build_lock); goto retry; } @@ -1497,10 +1489,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); + + kmem_cache_free(free_nid_slab, i); } /* @@ -1510,20 +1504,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + bool need_free = false; if (!nid) return; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(!i || i->state != NID_ALLOC); - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { - __del_from_free_nid_list(i); + if (!available_free_memory(nm_i, FREE_NIDS)) { + __del_from_free_nid_list(nm_i, i); + need_free = true; } else { i->state = NID_NEW; nm_i->fcnt++; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1531,10 +1530,83 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, block_t new_blkaddr) { rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr); + set_node_addr(sbi, ni, new_blkaddr, false); clear_node_page_dirty(page); } +void recover_inline_xattr(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + if (!f2fs_has_inline_xattr(inode)) + return; + + if (!IS_INODE(page)) + return; + + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) + return; + + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + dst_addr = inline_xattr_addr(ipage); + src_addr = inline_xattr_addr(page); + inline_size = inline_xattr_size(inode); + + memcpy(dst_addr, src_addr, inline_size); + + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); +} + +bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid = nid_of_node(page); + struct node_info ni; + + recover_inline_xattr(inode, page); + + if (!f2fs_has_xattr_block(ofs_of_node(page))) + return false; + + /* 1: invalidate the previous xattr nid */ + if (!prev_xnid) + goto recover_xnid; + + /* Deallocate node address */ + get_node_info(sbi, prev_xnid, &ni); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: allocate new xattr nid */ + if (unlikely(!inc_valid_node_count(sbi, inode))) + f2fs_bug_on(1); + + remove_free_nid(NM_I(sbi), new_xnid); + get_node_info(sbi, new_xnid, &ni); + ni.ino = inode->i_ino; + set_node_addr(sbi, &ni, NEW_ADDR, false); + F2FS_I(inode)->i_xattr_nid = new_xnid; + + /* 3: update xattr blkaddr */ + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); + set_node_addr(sbi, &ni, blkaddr, false); + + update_inode_page(inode); + return true; +} + int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *src, *dst; @@ -1567,7 +1639,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); - set_node_addr(sbi, &new_ni, NEW_ADDR); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); f2fs_put_page(ipage, 1); return 0; @@ -1590,15 +1662,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, for (; page_idx < start + nrpages; page_idx++) { /* alloc temporal page for read node summary info*/ page = alloc_page(GFP_F2FS_ZERO); - if (!page) { - struct page *tmp; - list_for_each_entry_safe(page, tmp, pages, lru) { - list_del(&page->lru); - unlock_page(page); - __free_pages(page, 0); - } - return -ENOMEM; - } + if (!page) + break; lock_page(page); page->index = page_idx; @@ -1609,7 +1674,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, f2fs_submit_page_mbio(sbi, page, page->index, &fio); f2fs_submit_merged_bio(sbi, META, READ); - return 0; + + return page_idx - start; } int restore_node_summary(struct f2fs_sb_info *sbi, @@ -1628,15 +1694,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi, addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { nrpages = min(last_offset - i, bio_blocks); /* read ahead node pages */ - err = ra_sum_pages(sbi, &page_list, addr, nrpages); - if (err) - return err; + nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages); + if (!nrpages) + return -ENOMEM; list_for_each_entry_safe(page, tmp, &page_list, lru) { + if (err) + goto skip; lock_page(page); if (unlikely(!PageUptodate(page))) { @@ -1648,9 +1716,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry->ofs_in_node = 0; sum_entry++; } - - list_del(&page->lru); unlock_page(page); +skip: + list_del(&page->lru); __free_pages(page, 0); } } @@ -1709,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct list_head *cur, *n; + struct nat_entry *ne, *cur; struct page *page = NULL; struct f2fs_nat_block *nat_blk = NULL; nid_t start_nid = 0, end_nid = 0; @@ -1721,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) mutex_lock(&curseg->curseg_mutex); /* 1) flush dirty nat caches */ - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { - struct nat_entry *ne; + list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; block_t new_blkaddr; - ne = list_entry(cur, struct nat_entry, list); - nid = nat_get_nid(ne); - if (nat_get_blkaddr(ne) == NEW_ADDR) continue; + + nid = nat_get_nid(ne); + if (flushed) goto to_nat_page; @@ -1783,16 +1850,12 @@ flush_now: } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); - ne->checkpointed = true; write_unlock(&nm_i->nat_tree_lock); } } if (!flushed) mutex_unlock(&curseg->curseg_mutex); f2fs_put_page(page, 1); - - /* 2) shrink nat caches if necessary */ - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1807,10 +1870,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -1864,8 +1931,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { f2fs_bug_on(i->state == NID_ALLOC); - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->free_nid_list_lock); } f2fs_bug_on(nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); @@ -1875,11 +1945,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; - for (idx = 0; idx < found; idx++) { - struct nat_entry *e = natvec[idx]; - nid = nat_get_nid(e) + 1; - __del_from_nat_cache(nm_i, e); - } + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) + __del_from_nat_cache(nm_i, natvec[idx]); } f2fs_bug_on(nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); @@ -1892,12 +1960,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", - sizeof(struct nat_entry), NULL); + sizeof(struct nat_entry)); if (!nat_entry_slab) return -ENOMEM; free_nid_slab = f2fs_kmem_cache_create("free_nid", - sizeof(struct free_nid), NULL); + sizeof(struct free_nid)); if (!free_nid_slab) { kmem_cache_destroy(nat_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c4c79885c993..5decc1a375f0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -17,14 +17,11 @@ /* # of pages to perform readahead before building free nids */ #define FREE_NID_PAGES 4 -/* maximum # of free node ids to produce during build_free_nids */ -#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) - /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 -/* maximum cached nat entries to manage memory footprint */ -#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 10 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -45,6 +42,7 @@ struct node_info { struct nat_entry { struct list_head list; /* for clean or dirty nat list */ bool checkpointed; /* whether it is checkpointed or not */ + bool fsync_done; /* whether the latest node has fsync mark */ struct node_info ni; /* in-memory node information */ }; @@ -58,9 +56,15 @@ struct nat_entry { #define nat_set_version(nat, v) (nat->ni.version = v) #define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); + do { \ + ne->checkpointed = false; \ + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ + } while (0); #define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); + do { \ + ne->checkpointed = true; \ + list_move_tail(&ne->list, &nm_i->nat_entries); \ + } while (0); #define inc_node_version(version) (++version) static inline void node_info_from_raw_nat(struct node_info *ni, @@ -71,6 +75,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } +enum nid_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES /* indicates the cached nat entry */ +}; + /* * For free nid mangement */ @@ -236,7 +245,7 @@ static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); - if (ofs == XATTR_NODE_OFFSET) + if (f2fs_has_xattr_block(ofs)) return false; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 976a7a934db5..b1ae89f0f44e 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { - struct list_head *this; struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; - } + return NULL; } @@ -136,7 +134,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ page = alloc_page(GFP_F2FS_ZERO); @@ -218,13 +216,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & - (sbi->blocks_per_seg - 1); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; struct f2fs_summary sum; + struct page *sum_page, *node_page; nid_t ino, nid; - void *kaddr; struct inode *inode; - struct page *node_page; unsigned int offset; block_t bidx; int i; @@ -238,18 +235,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; - break; + goto got_it; } } - if (i > CURSEG_COLD_DATA) { - struct page *sum_page = get_sum_page(sbi, segno); - struct f2fs_summary_block *sum_node; - kaddr = page_address(sum_page); - sum_node = (struct f2fs_summary_block *)kaddr; - sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); if (dn->inode->i_ino == nid) { @@ -301,6 +295,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (recover_inline_data(inode, page)) goto out; + if (recover_xattr_data(inode, page, blkaddr)) + goto out; + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) end = start + ADDRS_PER_INODE(fi); @@ -317,7 +314,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; } - wait_on_page_writeback(dn.node_page); + f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(ni.ino != ino_of_node(page)); @@ -437,7 +434,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry), NULL); + sizeof(struct fsync_inode_entry)); if (!fsync_entry_slab) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7caac5f2ca9e..085f548be7a3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -13,6 +13,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/prefetch.h> +#include <linux/kthread.h> #include <linux/vmalloc.h> #include <linux/swap.h> @@ -24,6 +25,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *flush_cmd_slab; /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since @@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) f2fs_sync_fs(sbi->sb, true); } +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_sm_info *sm_i = SM_I(sbi); + wait_queue_head_t *q = &sm_i->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) { + sm_i->dispatch_list = sm_i->issue_list; + sm_i->issue_list = sm_i->issue_tail = NULL; + } + spin_unlock(&sm_i->issue_lock); + + if (sm_i->dispatch_list) { + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct flush_cmd *cmd, *next; + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + + for (cmd = sm_i->dispatch_list; cmd; cmd = next) { + cmd->ret = ret; + next = cmd->next; + complete(&cmd->wait); + } + sm_i->dispatch_list = NULL; + } + + wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + struct flush_cmd *cmd; + int ret; + + if (!test_opt(sbi, FLUSH_MERGE)) + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + + cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC); + cmd->next = NULL; + cmd->ret = 0; + init_completion(&cmd->wait); + + spin_lock(&sm_i->issue_lock); + if (sm_i->issue_list) + sm_i->issue_tail->next = cmd; + else + sm_i->issue_list = cmd; + sm_i->issue_tail = cmd; + spin_unlock(&sm_i->issue_lock); + + if (!sm_i->dispatch_list) + wake_up(&sm_i->flush_wait_queue); + + wait_for_completion(&cmd->wait); + ret = cmd->ret; + kmem_cache_free(flush_cmd_slab, cmd); + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -340,8 +409,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->discard_list); - struct list_head *this, *next; - struct discard_entry *entry; + struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); @@ -370,8 +438,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ - list_for_each_safe(this, next, head) { - entry = list_entry(this, struct discard_entry, list); + list_for_each_entry_safe(entry, this, head, list) { f2fs_issue_discard(sbi, entry->blkaddr, entry->len); list_del(&entry->list); SM_I(sbi)->nr_discards -= entry->len; @@ -405,7 +472,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); @@ -434,12 +501,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -static void refresh_sit_entry(struct f2fs_sb_info *sbi, - block_t old_blkaddr, block_t new_blkaddr) +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) { - update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); + update_sit_entry(sbi, new, 1); + if (GET_SEGNO(sbi, old) != NULL_SEGNO) + update_sit_entry(sbi, old, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); } void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) @@ -881,17 +950,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -987,14 +1054,11 @@ void recover_data_page(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); @@ -1028,8 +1092,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ @@ -1037,28 +1100,50 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = next_segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); /* rewrite node page */ set_page_writeback(page); f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } +static inline bool is_merged_page(struct f2fs_sb_info *sbi, + struct page *page, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct bio_vec *bvec; + int i; + + down_read(&io->io_rwsem); + if (!io->bio) + goto out; + + bio_for_each_segment_all(bvec, io->bio, i) { + if (page == bvec->bv_page) { + up_read(&io->io_rwsem); + return true; + } + } + +out: + up_read(&io->io_rwsem); + return false; +} + void f2fs_wait_on_page_writeback(struct page *page, enum page_type type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { - f2fs_submit_merged_bio(sbi, type, WRITE); + if (is_merged_page(sbi, page, type)) + f2fs_submit_merged_bio(sbi, type, WRITE); wait_on_page_writeback(page); } } @@ -1167,9 +1252,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - if (restore_node_summary(sbi, segno, sum)) { + int err; + + err = restore_node_summary(sbi, segno, sum); + if (err) { f2fs_put_page(new, 1); - return -EINVAL; + return err; } } } @@ -1190,6 +1278,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { int type = CURSEG_HOT_DATA; + int err; if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ @@ -1198,9 +1287,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) type = CURSEG_HOT_NODE; } - for (; type <= CURSEG_COLD_NODE; type++) - if (read_normal_summaries(sbi, type)) - return -EINVAL; + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + return 0; } @@ -1583,47 +1675,6 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } -static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) -{ - struct address_space *mapping = META_MAPPING(sbi); - struct page *page; - block_t blk_addr, prev_blk_addr = 0; - int sit_blk_cnt = SIT_BLK_CNT(sbi); - int blkno = start; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { - - blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - - if (blkno != start && prev_blk_addr + 1 != blk_addr) - break; - prev_blk_addr = blk_addr; -repeat: - page = grab_cache_page(mapping, blk_addr); - if (!page) { - cond_resched(); - goto repeat; - } - if (PageUptodate(page)) { - mark_page_accessed(page); - f2fs_put_page(page, 1); - continue; - } - - f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); - - mark_page_accessed(page); - f2fs_put_page(page, 0); - } - - f2fs_submit_merged_bio(sbi, META, READ); - return blkno - start; -} - static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); @@ -1635,7 +1686,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); do { - readed = ra_sit_pages(sbi, start_blk, nrpages); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -1781,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + dev_t dev = sbi->sb->s_bdev->bd_dev; struct f2fs_sm_info *sm_info; int err; @@ -1799,7 +1851,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); - sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; sm_info->ipu_policy = F2FS_IPU_DISABLE; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; @@ -1807,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; + if (test_opt(sbi, FLUSH_MERGE)) { + spin_lock_init(&sm_info->issue_lock); + init_waitqueue_head(&sm_info->flush_wait_queue); + + sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(sm_info->f2fs_issue_flush)) + return PTR_ERR(sm_info->f2fs_issue_flush); + } + err = build_sit_info(sbi); if (err) return err; @@ -1915,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; + if (sm_info->f2fs_issue_flush) + kthread_stop(sm_info->f2fs_issue_flush); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1926,13 +1991,20 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) int __init create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("discard_entry", - sizeof(struct discard_entry), NULL); + sizeof(struct discard_entry)); if (!discard_entry_slab) return -ENOMEM; + flush_cmd_slab = f2fs_kmem_cache_create("flush_command", + sizeof(struct flush_cmd)); + if (!flush_cmd_slab) { + kmem_cache_destroy(discard_entry_slab); + return -ENOMEM; + } return 0; } void destroy_segment_manager_caches(void) { kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(flush_cmd_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5731682d7516..7091204680f4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -14,7 +14,7 @@ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) -#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -57,6 +57,9 @@ ((blk_addr) - SM_I(sbi)->seg0_blkaddr) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + #define GET_SEGNO(sbi, blk_addr) \ (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ @@ -377,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, static inline block_t written_block_count(struct f2fs_sb_info *sbi) { - struct sit_info *sit_i = SIT_I(sbi); - block_t vblocks; - - mutex_lock(&sit_i->sentry_lock); - vblocks = sit_i->written_valid_blocks; - mutex_unlock(&sit_i->sentry_lock); - - return vblocks; + return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_segs; - - read_lock(&free_i->segmap_lock); - free_segs = free_i->free_segments; - read_unlock(&free_i->segmap_lock); - - return free_segs; + return FREE_I(sbi)->free_segments; } static inline int reserved_segments(struct f2fs_sb_info *sbi) @@ -406,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi) static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_secs; - - read_lock(&free_i->segmap_lock); - free_secs = free_i->free_sections; - read_unlock(&free_i->segmap_lock); - - return free_secs; + return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) @@ -682,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) struct request_queue *q = bdev_get_queue(bdev); return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); } + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 3 for three types of nodes, and + * max_bio_blocks for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 3 * sbi->blocks_per_seg; + else if (type == META) + return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + + if (type == DATA) + desired = 4096; + else if (type == NODE) + desired = 3 * max_hw_blocks(sbi); + else + desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a85f83abd53..c756923a7302 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -51,6 +51,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_inline_data, + Opt_flush_merge, Opt_err, }; @@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, + {Opt_flush_merge, "flush_merge"}, {Opt_err, NULL}, }; @@ -74,6 +76,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ }; @@ -92,6 +95,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; return NULL; @@ -183,7 +188,9 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -196,6 +203,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), NULL, }; @@ -256,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (!strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) set_opt(sbi, BG_GC); - else if (!strncmp(name, "off", 3)) + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) clear_opt(sbi, BG_GC); else { kfree(name); @@ -327,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_data: set_opt(sbi, INLINE_DATA); break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -353,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); + init_rwsem(&fi->i_sem); set_inode_flag(fi, FI_NEW_INODE); if (test_opt(F2FS_SB(sb), INLINE_XATTR)) set_inode_flag(fi, FI_INLINE_XATTR); + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -526,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); + if (test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -539,13 +557,22 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) le32_to_cpu(sbi->raw_super->segment_count_main); int i; + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + for (i = 0; i < total_segs; i++) { - seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); - if (i != 0 && (i % 10) == 0) - seq_puts(seq, "\n"); + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-5d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, 1)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); else - seq_puts(seq, " "); + seq_putc(seq, ' '); } + return 0; } @@ -568,6 +595,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; int err, active_logs; + sync_filesystem(sb); + /* * Save the old mount options in case we * need to restore them. @@ -638,6 +667,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, if (unlikely(ino < F2FS_ROOT_INO(sbi))) return ERR_PTR(-ESTALE); + if (unlikely(ino >= NM_I(sbi)->max_nid)) + return ERR_PTR(-ESTALE); /* * f2fs_iget isn't quite right if the inode is currently unallocated! @@ -785,6 +816,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + + sbi->dir_level = DEF_DIR_LEVEL; } /* @@ -896,11 +929,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - mutex_init(&sbi->read_io.io_mutex); + init_rwsem(&sbi->read_io.io_rwsem); sbi->read_io.sbi = sbi; sbi->read_io.bio = NULL; for (i = 0; i < NR_PAGE_TYPE; i++) { - mutex_init(&sbi->write_io[i].io_mutex); + init_rwsem(&sbi->write_io[i].io_rwsem); sbi->write_io[i].sbi = sbi; sbi->write_io[i].bio = NULL; } @@ -989,28 +1022,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { - err = recover_fsync_data(sbi); - if (err) - f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); - } - - /* - * If filesystem is not mounted as read-only then - * do start the gc_thread. - */ - if (!(sb->s_flags & MS_RDONLY)) { - /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); - if (err) - goto free_gc; - } - err = f2fs_build_stats(sbi); if (err) - goto free_gc; + goto free_root_inode; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -1032,17 +1046,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, "%s", sb->s_id); if (err) - goto fail; + goto free_proc; + + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto free_kobj; + } return 0; -fail: + +free_kobj: + kobject_del(&sbi->s_kobj); +free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); -free_gc: - stop_gc_thread(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -1082,7 +1115,7 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info), NULL); + sizeof(struct f2fs_inode_info)); if (!f2fs_inode_cachep) return -ENOMEM; return 0; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 89d0422a91a8..503c2451131e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_size = inline_xattr_size(inode); - txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) return NULL; @@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; name_len = strlen(name); + if (name_len > F2FS_NAME_LEN) + return -ERANGE; base_addr = read_all_xattrs(inode, NULL); if (!base_addr) @@ -590,7 +592,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, f2fs_balance_fs(sbi); f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); + up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); return err; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 854b578f6695..b3361fe2bcb5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode); static void fat_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); @@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + sync_filesystem(sb); + /* make sure we update state on remount. */ new_rdonly = *flags & MS_RDONLY; if (new_rdonly != (sb->s_flags & MS_RDONLY)) { diff --git a/fs/fcntl.c b/fs/fcntl.c index ef6866592a0f..72c82f69b01b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -272,9 +272,19 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SETFL: err = setfl(fd, filp, arg); break; +#if BITS_PER_LONG != 32 + /* 32-bit arches must use fcntl64() */ + case F_OFD_GETLK: +#endif case F_GETLK: - err = fcntl_getlk(filp, (struct flock __user *) arg); + err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); break; +#if BITS_PER_LONG != 32 + /* 32-bit arches must use fcntl64() */ + case F_OFD_SETLK: + case F_OFD_SETLKW: +#endif + /* Fallthrough */ case F_SETLK: case F_SETLKW: err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); @@ -388,17 +398,20 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, goto out1; switch (cmd) { - case F_GETLK64: - err = fcntl_getlk64(f.file, (struct flock64 __user *) arg); - break; - case F_SETLK64: - case F_SETLKW64: - err = fcntl_setlk64(fd, f.file, cmd, - (struct flock64 __user *) arg); - break; - default: - err = do_fcntl(fd, cmd, arg, f.file); - break; + case F_GETLK64: + case F_OFD_GETLK: + err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); + break; + case F_SETLK64: + case F_SETLKW64: + case F_OFD_SETLK: + case F_OFD_SETLKW: + err = fcntl_setlk64(fd, f.file, cmd, + (struct flock64 __user *) arg); + break; + default: + err = do_fcntl(fd, cmd, arg, f.file); + break; } out1: fdput(f); diff --git a/fs/file.c b/fs/file.c index eb56a13dab3e..8f294cfac697 100644 --- a/fs/file.c +++ b/fs/file.c @@ -25,7 +25,10 @@ int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; -int sysctl_nr_open_max = 1024 * 1024; /* raised later */ +/* our max() is unusable in constant expressions ;-/ */ +#define __const_max(x, y) ((x) < (y) ? (x) : (y)) +int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk) } } -void __init files_defer_init(void) -{ - sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; -} - struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, @@ -497,7 +494,7 @@ repeat: error = fd; #if 1 /* Sanity check */ - if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { + if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } diff --git a/fs/file_table.c b/fs/file_table.c index 5b24008ea4f6..a374f5033e97 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); - file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -178,47 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; - - /* - * These mounts don't really matter in practice - * for r/o bind mounts. They aren't userspace- - * visible. We do this for consistency, and so - * that we can do debugging checks at __fput() - */ - if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - file_take_write(file); - WARN_ON(mnt_clone_write(path->mnt)); - } if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } EXPORT_SYMBOL(alloc_file); -/** - * drop_file_write_access - give up ability to write to a file - * @file: the file to which we will stop writing - * - * This is a central place which will give up the ability - * to write to @file, along with access to write through - * its vfsmount. - */ -static void drop_file_write_access(struct file *file) -{ - struct vfsmount *mnt = file->f_path.mnt; - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - - put_write_access(inode); - - if (special_file(inode->i_mode)) - return; - if (file_check_writeable(file) != 0) - return; - __mnt_drop_write(mnt); - file_release_write(file); -} - /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) @@ -235,7 +199,7 @@ static void __fput(struct file *file) * in the file cleanup chain. */ eventpoll_release(file); - locks_remove_flock(file); + locks_remove_file(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) @@ -253,8 +217,10 @@ static void __fput(struct file *file) put_pid(file->f_owner.pid); if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITE) - drop_file_write_access(file); + if (file->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(mnt); + } file->f_path.dentry = NULL; file->f_path.mnt = NULL; file->f_inode = NULL; @@ -359,6 +325,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - files_defer_init(); percpu_counter_init(&nr_files, 0); } diff --git a/fs/filesystems.c b/fs/filesystems.c index 92567d95ba6a..5797d45a78cb 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs) EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) } return retval; } +#endif int __init get_filesystem_list(char *buf) { diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index f47df72cef17..363e3ae25f6b 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head) void vxfs_evict_inode(struct inode *ip) { - truncate_inode_pages(&ip->i_data, 0); + truncate_inode_pages_final(&ip->i_data); clear_inode(ip); call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 25d4099a4aea..99c7f0a37af4 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * vxfs_lookup - lookup pathname component * @dip: dir in which we lookup * @dp: dentry we lookup - * @nd: lookup nameidata + * @flags: lookup flags * * Description: * vxfs_lookup tries to lookup the pathname component described diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index e37eb274e492..7ca8c75d50d3 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d754e3cf99a8..be568b7311d6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -89,16 +89,31 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); + +static void bdi_wakeup_thread(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + spin_unlock_bh(&bdi->wb_lock); +} + static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { trace_writeback_queue(bdi, work); spin_lock_bh(&bdi->wb_lock); + if (!test_bit(BDI_registered, &bdi->state)) { + if (work->done) + complete(work->done); + goto out_unlock; + } list_add_tail(&work->list, &bdi->work_list); - spin_unlock_bh(&bdi->wb_lock); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); +out_unlock: + spin_unlock_bh(&bdi->wb_lock); } static void @@ -114,7 +129,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { trace_writeback_nowork(bdi); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + bdi_wakeup_thread(bdi); return; } @@ -161,7 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + bdi_wakeup_thread(bdi); } /* @@ -1017,7 +1032,7 @@ void bdi_writeback_workfn(struct work_struct *work) current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - list_empty(&bdi->bdi_list))) { + !test_bit(BDI_registered, &bdi->state))) { /* * The normal path. Keep writing back @bdi until its * work_list is empty. Note that this path is also taken @@ -1039,10 +1054,10 @@ void bdi_writeback_workfn(struct work_struct *work) trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list) || - (wb_has_dirty_io(wb) && dirty_writeback_interval)) - queue_delayed_work(bdi_wq, &wb->dwork, - msecs_to_jiffies(dirty_writeback_interval * 10)); + if (!list_empty(&bdi->work_list)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + else if (wb_has_dirty_io(wb) && dirty_writeback_interval) + bdi_wakeup_thread_delayed(bdi); current->flags &= ~PF_SWAPWRITE; } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a0b0855d00a9..205e0d5d5307 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -348,7 +348,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b96a49b37d66..13b691a8a7d2 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, struct iovec iov = { .iov_base = buf, .iov_len = count }; struct fuse_io_priv io = { .async = 0, .file = file }; - return fuse_direct_io(&io, &iov, 1, count, &pos, 0); + return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE); } static ssize_t cuse_write(struct file *file, const char __user *buf, @@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf, * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(&io, &iov, 1, count, &pos, 1); + return fuse_direct_io(&io, &iov, 1, count, &pos, + FUSE_DIO_WRITE | FUSE_DIO_CUSE); } static int cuse_open(struct inode *inode, struct file *file) @@ -568,7 +569,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev, return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); } -static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); +static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL); static ssize_t cuse_class_abort_store(struct device *dev, struct device_attribute *attr, @@ -579,7 +580,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, fuse_abort_conn(&cc->fc); return count; } -static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); +static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); static struct attribute *cuse_class_dev_attrs[] = { &dev_attr_waiting.attr, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a648bb455ae..aac71ce373e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -667,15 +667,15 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->currbuf; if (!cs->write) { - buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + kunmap_atomic(cs->mapaddr); } else { - kunmap(buf->page); + kunmap_atomic(cs->mapaddr); buf->len = PAGE_SIZE - cs->len; } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { - kunmap(cs->pg); + kunmap_atomic(cs->mapaddr); if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); @@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->mapaddr = kmap_atomic(buf->page); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap(page); + cs->mapaddr = kmap_atomic(page); cs->buf = cs->mapaddr; cs->len = PAGE_SIZE; cs->pipebufs++; @@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) return err; BUG_ON(err != 1); offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap(cs->pg); + cs->mapaddr = kmap_atomic(cs->pg); cs->buf = cs->mapaddr + offset; cs->len = min(PAGE_SIZE - offset, cs->seglen); cs->seglen -= cs->len; @@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = kmap_atomic(buf->page); cs->buf = cs->mapaddr + buf->offset; err = lock_request(cs->fc, cs->req); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1d1292c581c3..42198359fa1b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -743,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -771,15 +783,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); + + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -795,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, sizeof(struct fuse_rename_in)); +} + +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + return err; + +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -829,6 +878,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -839,6 +889,16 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see the comment in fuse_change_attributes() */ + if (fc->writeback_cache && S_ISREG(inode->i_mode)) { + attr->size = i_size_read(inode); + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; + } stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; @@ -1477,12 +1537,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } -static bool update_mtime(unsigned ivalid) +static bool update_mtime(unsigned ivalid, bool trust_local_mtime) { /* Always update if mtime is explicitly set */ if (ivalid & ATTR_MTIME_SET) return true; + /* Or if kernel i_mtime is the official one */ + if (trust_local_mtime) + return true; + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) return false; @@ -1491,7 +1555,8 @@ static bool update_mtime(unsigned ivalid) return true; } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) +static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; @@ -1510,13 +1575,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET)) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } } /* @@ -1563,6 +1633,62 @@ void fuse_release_nowrite(struct inode *inode) spin_unlock(&fc->lock); } +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, + struct inode *inode, + struct fuse_setattr_in *inarg_p, + struct fuse_attr_out *outarg_p) +{ + req->in.h.opcode = FUSE_SETATTR; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(*inarg_p); + req->in.args[0].value = inarg_p; + req->out.numargs = 1; + if (fc->minor < 9) + req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + else + req->out.args[0].size = sizeof(*outarg_p); + req->out.args[0].value = outarg_p; +} + +/* + * Flush inode->i_mtime to the server + */ +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_setattr_in inarg; + struct fuse_attr_out outarg; + int err; + + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + + inarg.valid = FATTR_MTIME; + inarg.mtime = inode->i_mtime.tv_sec; + inarg.mtimensec = inode->i_mtime.tv_nsec; + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + return err; +} + /* * Set attributes, and at the same time refresh them. * @@ -1580,8 +1706,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; + bool is_wb = fc->writeback_cache; loff_t oldsize; int err; + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1606,11 +1734,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1621,17 +1751,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } - req->in.h.opcode = FUSE_SETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; - else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -1648,10 +1768,21 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, } spin_lock(&fc->lock); + /* the kernel maintains i_mtime locally */ + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; + /* FIXME: clear I_DIRTY_SYNC? */ + } + fuse_change_attributes_common(inode, &outarg.attr, attr_timeout(&outarg)); oldsize = inode->i_size; - i_size_write(inode, outarg.attr.size); + /* see the comment in fuse_change_attributes() */ + if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + i_size_write(inode, outarg.attr.size); if (is_truncate) { /* NOTE: this may release/reacquire fc->lock */ @@ -1663,7 +1794,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, * Only call invalidate_inode_pages2() after removing * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ - if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if ((is_truncate || !is_wb) && + S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } @@ -1739,8 +1871,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1870,8 +2004,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1882,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 77bcc303c3ae..96d513e01a5d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, } EXPORT_SYMBOL_GPL(fuse_do_open); +static void fuse_link_write_file(struct file *file) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); +} + void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; @@ -207,25 +223,37 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) + fuse_link_write_file(file); } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; err = generic_file_open(inode, file); if (err) return err; + if (lock_inode) + mutex_lock(&inode->i_mutex); + err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; - fuse_finish_open(inode, file); + if (!err) + fuse_finish_open(inode, file); - return 0; + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; } static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) @@ -292,6 +320,12 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + struct fuse_conn *fc = get_fuse_conn(inode); + + /* see fuse_vma_close() for !writeback_cache case */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + fuse_release_common(file, FUSE_RELEASE); /* return value is ignored by VFS */ @@ -333,12 +367,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) } /* - * Check if page is under writeback + * Check if any page in a range is under writeback * * This is currently done by walking the list of writepage requests * for the inode, which can be pretty inefficient. */ -static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -351,8 +386,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) BUG_ON(req->inode != inode); curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; - if (curr_index <= index && - index < curr_index + req->num_pages) { + if (idx_from < curr_index + req->num_pages && + curr_index <= idx_to) { found = true; break; } @@ -362,6 +397,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) return found; } +static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + return fuse_range_is_writeback(inode, index, index); +} + /* * Wait for page writeback to be completed. * @@ -376,6 +416,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) return 0; } +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); @@ -391,6 +446,14 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; + err = write_inode_now(inode, 1); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + mutex_unlock(&inode->i_mutex); + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -411,21 +474,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; } -/* - * Wait for all pending writepages on the inode to finish. - * - * This is currently done by blocking further writes with FUSE_NOWRITE - * and waiting for all sent writes to complete. - * - * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage - * could conflict with truncation. - */ -static void fuse_sync_writes(struct inode *inode) -{ - fuse_set_nowrite(inode); - fuse_release_nowrite(inode); -} - int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int isdir) { @@ -439,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) - return 0; - mutex_lock(&inode->i_mutex); /* @@ -453,11 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = write_inode_now(inode, 0); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) goto out; fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; + + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { @@ -655,7 +702,33 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fc->lock); } -static int fuse_readpage(struct file *file, struct page *page) +static void fuse_short_read(struct fuse_req *req, struct inode *inode, + u64 attr_ver) +{ + size_t num_read = req->out.args[0].size; + struct fuse_conn *fc = get_fuse_conn(inode); + + if (fc->writeback_cache) { + /* + * A hole in a file. Some data after the hole are in page cache, + * but have not reached the client fs yet. So, the hole is not + * present there. + */ + int i; + int start_idx = num_read >> PAGE_CACHE_SHIFT; + size_t off = num_read & (PAGE_CACHE_SIZE - 1); + + for (i = start_idx; i < req->num_pages; i++) { + zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE); + off = 0; + } + } else { + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, attr_ver); + } +} + +static int fuse_do_readpage(struct file *file, struct page *page) { struct fuse_io_priv io = { .async = 0, .file = file }; struct inode *inode = page->mapping->host; @@ -667,10 +740,6 @@ static int fuse_readpage(struct file *file, struct page *page) u64 attr_ver; int err; - err = -EIO; - if (is_bad_inode(inode)) - goto out; - /* * Page writeback can extend beyond the lifetime of the * page-cache page, so make sure we read a properly synced @@ -679,9 +748,8 @@ static int fuse_readpage(struct file *file, struct page *page) fuse_wait_on_page_writeback(inode, page->index); req = fuse_get_req(fc, 1); - err = PTR_ERR(req); if (IS_ERR(req)) - goto out; + return PTR_ERR(req); attr_ver = fuse_get_attr_version(fc); @@ -692,18 +760,32 @@ static int fuse_readpage(struct file *file, struct page *page) req->page_descs[0].length = count; num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; - fuse_put_request(fc, req); if (!err) { /* * Short read means EOF. If file size is larger, truncate it */ if (num_read < count) - fuse_read_update_size(inode, pos + num_read, attr_ver); + fuse_short_read(req, inode, attr_ver); SetPageUptodate(page); } + fuse_put_request(fc, req); + + return err; +} + +static int fuse_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = fuse_do_readpage(file, page); fuse_invalidate_atime(inode); out: unlock_page(page); @@ -726,13 +808,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) { - loff_t pos; + if (!req->out.h.error && num_read < count) + fuse_short_read(req, inode, req->misc.read.attr_ver); - pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, - req->misc.read.attr_ver); - } fuse_invalidate_atime(inode); } @@ -922,16 +1000,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, return req->misc.write.out.size; } -void fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool ret = false; spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) + if (pos > inode->i_size) { i_size_write(inode, pos); + ret = true; + } spin_unlock(&fc->lock); + + return ret; } static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, @@ -1003,9 +1086,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); @@ -1116,6 +1197,15 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, struct iov_iter i; loff_t endbyte = 0; + if (get_fuse_conn(inode)->writeback_cache) { + /* Update size (EOF optimization) and mode (SUID clearing) */ + err = fuse_update_attributes(mapping->host, NULL, file, NULL); + if (err) + return err; + + return generic_file_aio_write(iocb, iov, nr_segs, pos); + } + WARN_ON(iocb->ki_pos != pos); ocount = 0; @@ -1145,8 +1235,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, iov, &nr_segs, - pos, &iocb->ki_pos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) goto out; @@ -1289,13 +1378,18 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p) ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, - int write) + int flags) { + int write = flags & FUSE_DIO_WRITE; + int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; + struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; size_t nmax = write ? fc->max_write : fc->max_read; loff_t pos = *ppos; + pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT; + pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT; ssize_t res = 0; struct fuse_req *req; struct iov_iter ii; @@ -1309,6 +1403,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, if (IS_ERR(req)) return PTR_ERR(req); + if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!write) + mutex_lock(&inode->i_mutex); + fuse_sync_writes(inode); + if (!write) + mutex_unlock(&inode->i_mutex); + } + while (count) { size_t nres; fl_owner_t owner = current->files; @@ -1397,7 +1499,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io, res = generic_write_checks(file, ppos, &count, 0); if (!res) - res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); + res = fuse_direct_io(io, iov, nr_segs, count, ppos, + FUSE_DIO_WRITE); fuse_invalidate_attr(inode); @@ -1556,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_writepage_free(fc, req); } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) { struct fuse_file *ff = NULL; spin_lock(&fc->lock); - if (!WARN_ON(list_empty(&fi->write_files))) { + if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); @@ -1572,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, return ff; } +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_times(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1885,6 +2011,77 @@ out: return err; } +/* + * It's worthy to make sure that space is reserved on disk for the write, + * but how to implement it without killing performance need more thinking. + */ +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode); + struct page *page; + loff_t fsize; + int err = -ENOMEM; + + WARN_ON(!fc->writeback_cache); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + goto error; + + fuse_wait_on_page_writeback(mapping->host, page->index); + + if (PageUptodate(page) || len == PAGE_CACHE_SIZE) + goto success; + /* + * Check if the start this page comes after the end of file, in which + * case the readpage can be optimized away. + */ + fsize = i_size_read(mapping->host); + if (fsize <= (pos & PAGE_CACHE_MASK)) { + size_t off = pos & ~PAGE_CACHE_MASK; + if (off) + zero_user_segment(page, 0, off); + goto success; + } + err = fuse_do_readpage(file, page); + if (err) + goto cleanup; +success: + *pagep = page; + return 0; + +cleanup: + unlock_page(page); + page_cache_release(page); +error: + return err; +} + +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + if (!PageUptodate(page)) { + /* Zero any unwritten bytes at the end of the page */ + size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK; + if (endoff) + zero_user_segment(page, endoff, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + fuse_write_update_size(inode, pos + copied); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + static int fuse_launder_page(struct page *page) { int err = 0; @@ -1940,26 +2137,16 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_file *ff = file->private_data; - /* - * file may be written through mmap, so chain it onto the - * inodes's write_file list - */ - spin_lock(&fc->lock); - if (list_empty(&ff->write_entry)) - list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); - } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + fuse_link_write_file(file); + file_accessed(file); vma->vm_ops = &fuse_file_vm_ops; return 0; @@ -2606,7 +2793,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc, { spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { - struct rb_node **link, *parent; + struct rb_node **link, *uninitialized_var(parent); link = fuse_find_polled_node(fc, ff->kh, &parent); BUG_ON(*link); @@ -2808,6 +2995,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (fc->no_fallocate) return -EOPNOTSUPP; @@ -2850,8 +3040,12 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; /* we could have extended the file */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) - fuse_write_update_size(inode, offset + length); + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + bool changed = fuse_write_update_size(inode, offset + length); + + if (changed && fc->writeback_cache) + file_update_time(file); + } if (mode & FALLOC_FL_PUNCH_HOLE) truncate_pagecache_range(inode, offset, offset + length - 1); @@ -2915,6 +3109,8 @@ static const struct address_space_operations fuse_file_aops = { .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, }; void fuse_init_file_inode(struct inode *inode) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2da5db2c8bdb..7aa5c75e0de1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -480,6 +480,9 @@ struct fuse_conn { /** Set if bdi is valid */ unsigned bdi_initialized:1; + /** write-back cache policy (default is write-through) */ + unsigned writeback_cache:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -539,6 +542,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; @@ -720,7 +726,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void); /** * Allocate a request @@ -863,9 +869,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); + +/** + * fuse_direct_io() flags + */ + +/** If set, it is WRITE; otherwise - READ */ +#define FUSE_DIO_WRITE (1 << 0) + +/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */ +#define FUSE_DIO_CUSE (1 << 1) + ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, unsigned long nr_segs, size_t count, loff_t *ppos, - int write); + int flags); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, @@ -873,7 +890,10 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -void fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_size(struct inode *inode, loff_t pos); + +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d468643a68b2..754dcf23de8a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode) static void fuse_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & MS_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (*flags & MS_MANDLOCK) return -EINVAL; @@ -170,10 +171,13 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + /* mtime from server may be stale due to local buffered write */ + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; + } if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -197,6 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + bool is_wb = fc->writeback_cache; loff_t oldsize; struct timespec old_mtime; @@ -211,10 +216,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; - i_size_write(inode, attr->size); + /* + * In case of writeback_cache enabled, the cached writes beyond EOF + * extend local i_size without keeping userspace server in sync. So, + * attr->size coming from server can be stale. We cannot trust it. + */ + if (!is_wb || !S_ISREG(inode->i_mode)) + i_size_write(inode, attr->size); spin_unlock(&fc->lock); - if (S_ISREG(inode->i_mode)) { + if (!is_wb && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { @@ -243,6 +254,10 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; + inode->i_mtime.tv_sec = attr->mtime; + inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); @@ -289,7 +304,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return NULL; if ((inode->i_state & I_NEW)) { - inode->i_flags |= S_NOATIME|S_NOCMTIME; + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache || !S_ISREG(attr->mode)) + inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); @@ -773,6 +790,7 @@ static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, @@ -873,6 +891,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) } if (arg->flags & FUSE_ASYNC_DIO) fc->async_dio = 1; + if (arg->flags & FUSE_WRITEBACK_CACHE) + fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; + else + fc->sb->s_time_gran = 1000000000; + } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -900,7 +925,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | - FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO; + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | + FUSE_WRITEBACK_CACHE; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -978,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~MS_NOSEC; + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index ba9456685f47..3088e2a38e30 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -64,18 +64,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) return acl; } -static int gfs2_set_mode(struct inode *inode, umode_t mode) -{ - int error = 0; - - if (mode != inode->i_mode) { - inode->i_mode = mode; - mark_inode_dirty(inode); - } - - return error; -} - int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; @@ -85,8 +73,8 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) BUG_ON(name == NULL); - if (acl->a_count > GFS2_ACL_MAX_ENTRIES) - return -EINVAL; + if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; @@ -98,9 +86,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (error == 0) acl = NULL; - error = gfs2_set_mode(inode, mode); - if (error) - return error; + if (mode != inode->i_mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } } if (acl) { diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 301260c999ba..2d65ec4cd4be 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -14,7 +14,7 @@ #define GFS2_POSIX_ACL_ACCESS "posix_acl_access" #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" -#define GFS2_ACL_MAX_ENTRIES 25 +#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 49436fa7cd4f..ce62dcac90b6 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -21,6 +21,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> #include <linux/aio.h> +#include <trace/events/writeback.h> #include "gfs2.h" #include "incore.h" @@ -230,13 +231,11 @@ static int gfs2_writepages(struct address_space *mapping, static int gfs2_write_jdata_pagevec(struct address_space *mapping, struct writeback_control *wbc, struct pagevec *pvec, - int nr_pages, pgoff_t end) + int nr_pages, pgoff_t end, + pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset = i_size & (PAGE_CACHE_SIZE-1); unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); int i; int ret; @@ -248,40 +247,83 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) { + /* + * can't be range_cyclic (1st pass) because + * end == -1 in that case. + */ + ret = 1; + break; + } + + *done_index = page->index; + lock_page(page); if (unlikely(page->mapping != mapping)) { +continue_unlock: unlock_page(page); continue; } - if (!wbc->range_cyclic && page->index > end) { - ret = 1; - unlock_page(page); - continue; + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; + if (PageWriteback(page)) { + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + else + goto continue_unlock; } - /* Is the page fully outside i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0, - PAGE_CACHE_SIZE); - unlock_page(page); - continue; - } + BUG_ON(PageWriteback(page)); + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = __gfs2_jdata_writepage(page, wbc); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + } else { + + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + *done_index = page->index + 1; + ret = 1; + break; + } + } - if (ret || (--(wbc->nr_to_write) <= 0)) + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { ret = 1; + break; + } + } gfs2_trans_end(sdp); return ret; @@ -306,51 +348,69 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int done = 0; struct pagevec pvec; int nr_pages; + pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; - int scanned = 0; + pgoff_t done_index; + int cycled; int range_whole = 0; + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - scanned = 1; + cycled = 1; /* ignore range_cyclic tests */ } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - scanned = 1; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index); if (ret) done = 1; if (ret > 0) ret = 0; - pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done) { + if (!cycled && !done) { /* + * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ - scanned = 1; + cycled = 1; index = 0; + end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + mapping->writeback_index = done_index; + return ret; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index fe0500c0af7a..c62d4b9f51dc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1328,6 +1328,121 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) } /** + * gfs2_free_journal_extents - Free cached journal bmap info + * @jd: The journal + * + */ + +void gfs2_free_journal_extents(struct gfs2_jdesc *jd) +{ + struct gfs2_journal_extent *jext; + + while(!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list); + list_del(&jext->list); + kfree(jext); + } +} + +/** + * gfs2_add_jextent - Add or merge a new extent to extent cache + * @jd: The journal descriptor + * @lblock: The logical block at start of new extent + * @pblock: The physical block at start of new extent + * @blocks: Size of extent in fs blocks + * + * Returns: 0 on success or -ENOMEM + */ + +static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks) +{ + struct gfs2_journal_extent *jext; + + if (!list_empty(&jd->extent_list)) { + jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list); + if ((jext->dblock + jext->blocks) == dblock) { + jext->blocks += blocks; + return 0; + } + } + + jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS); + if (jext == NULL) + return -ENOMEM; + jext->dblock = dblock; + jext->lblock = lblock; + jext->blocks = blocks; + list_add_tail(&jext->list, &jd->extent_list); + jd->nr_extents++; + return 0; +} + +/** + * gfs2_map_journal_extents - Cache journal bmap info + * @sdp: The super block + * @jd: The journal to map + * + * Create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. + * + * Returns: 0 on success, or error on failure + */ + +int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + u64 lblock = 0; + u64 lblock_stop; + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct buffer_head bh; + unsigned int shift = sdp->sd_sb.sb_bsize_shift; + u64 size; + int rc; + + lblock_stop = i_size_read(jd->jd_inode) >> shift; + size = (lblock_stop - lblock) << shift; + jd->nr_extents = 0; + WARN_ON(!list_empty(&jd->extent_list)); + + do { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = size; + rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0); + if (rc || !buffer_mapped(&bh)) + goto fail; + rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift); + if (rc) + goto fail; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, + jd->nr_extents); + return 0; + +fail: + fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n", + rc, jd->jd_jid, + (unsigned long long)(i_size_read(jd->jd_inode) - size), + jd->nr_extents); + fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n", + rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr, + bh.b_state, (unsigned long long)bh.b_size); + gfs2_free_journal_extents(jd); + return rc; +} + +/** * gfs2_write_alloc_required - figure out if a write will require an allocation * @ip: the file being written to * @offset: the offset to write to diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 42fea03e2bd9..81ded5e2aaa2 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -55,5 +55,7 @@ extern int gfs2_truncatei_resume(struct gfs2_inode *ip); extern int gfs2_file_dealloc(struct gfs2_inode *ip); extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len); +extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); +extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index fa32655449c8..1a349f9a9685 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -53,6 +53,8 @@ * but never before the maximum hash table size has been reached. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/buffer_head.h> @@ -507,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, goto error; return 0; error: - printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, - first ? "first in block" : "not first in block"); + pr_warn("%s: %s (%s)\n", + __func__, msg, first ? "first in block" : "not first in block"); return -EIO; } @@ -531,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf) } return offset; wrong_type: - printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", - be32_to_cpu(h->mh_type)); + pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type)); return -1; } @@ -728,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { - /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ + /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; } @@ -1006,7 +1007,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { - printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); + pr_warn("i_depth %u lf_depth %u index %u\n", + dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); gfs2_consist_inode(dip); error = -EIO; goto fail_brelse; @@ -1684,6 +1686,14 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) return 0; } +static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip) +{ + u64 where = ip->i_no_addr + 1; + if (ip->i_eattr == where) + return 1; + return 0; +} + /** * gfs2_dir_add - Add new filename into directory * @inode: The directory inode @@ -1721,6 +1731,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); tv = CURRENT_TIME; if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index efc078f0ee4e..80d67253623c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -494,6 +494,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -811,6 +812,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; loff_t max_chunk_size = UINT_MAX & bsize_mask; + struct gfs2_holder gh; + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; /* We only support the FALLOC_FL_KEEP_SIZE mode */ @@ -831,8 +834,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (error) return error; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); + mutex_lock(&inode->i_mutex); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq(&gh); if (unlikely(error)) goto out_uninit; @@ -900,9 +905,10 @@ out_trans_fail: out_qunlock: gfs2_quota_unlock(ip); out_unlock: - gfs2_glock_dq(&ip->i_gh); + gfs2_glock_dq(&gh); out_uninit: - gfs2_holder_uninit(&ip->i_gh); + gfs2_holder_uninit(&gh); + mutex_unlock(&inode->i_mutex); return error; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ca0be6c69a26..aec7f73832f0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -468,7 +470,7 @@ retry: do_xmote(gl, gh, LM_ST_UNLOCKED); break; default: /* Everything else */ - printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state); + pr_err("wanted %u got %u\n", gl->gl_target, state); GLOCK_BUG_ON(gl, 1); } spin_unlock(&gl->gl_spin); @@ -542,7 +544,7 @@ __acquires(&gl->gl_spin) /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); if (ret) { - printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret); + pr_err("lm_lock ret %d\n", ret); GLOCK_BUG_ON(gl, 1); } } else { /* lock_nolock */ @@ -935,7 +937,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR " %pV", &vaf); + pr_err("%pV", &vaf); } va_end(args); @@ -1010,13 +1012,13 @@ do_cancel: return; trap_recursive: - printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip); - printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid)); - printk(KERN_ERR "lock type: %d req lock state : %d\n", + pr_err("original: %pSR\n", (void *)gh2->gh_ip); + pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); - printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip); - printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); - printk(KERN_ERR "lock type: %d req lock state : %d\n", + pr_err("new: %pSR\n", (void *)gh->gh_ip); + pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid)); + pr_err("lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); gfs2_dump_glock(NULL, gl); BUG(); @@ -1045,9 +1047,13 @@ int gfs2_glock_nq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); add_to_queue(gh); - if ((LM_FLAG_NOEXP & gh->gh_flags) && - test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gl->gl_lockref.count++; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gl->gl_lockref.count--; + } run_queue(gl, 1); spin_unlock(&gl->gl_spin); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 3bf0631b5d56..54b66809e818 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -82,6 +82,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) struct gfs2_trans tr; memset(&tr, 0, sizeof(tr)); + INIT_LIST_HEAD(&tr.tr_buf); + INIT_LIST_HEAD(&tr.tr_databuf); tr.tr_revokes = atomic_read(&gl->gl_ail_count); if (!tr.tr_revokes) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index cf0e34400f71..bdf70c18610c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -52,7 +52,7 @@ struct gfs2_log_header_host { */ struct gfs2_log_operations { - void (*lo_before_commit) (struct gfs2_sbd *sdp); + void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); void (*lo_before_scan) (struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass); @@ -371,6 +371,7 @@ enum { GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, GIF_ORDERED = 4, + GIF_FREE_VFS_INODE = 5, }; struct gfs2_inode { @@ -462,11 +463,11 @@ struct gfs2_trans { unsigned int tr_blocks; unsigned int tr_revokes; unsigned int tr_reserved; + unsigned int tr_touched:1; + unsigned int tr_attached:1; struct gfs2_holder tr_t_gh; - int tr_touched; - int tr_attached; unsigned int tr_num_buf_new; unsigned int tr_num_databuf_new; @@ -476,6 +477,8 @@ struct gfs2_trans { unsigned int tr_num_revoke_rm; struct list_head tr_list; + struct list_head tr_databuf; + struct list_head tr_buf; unsigned int tr_first; struct list_head tr_ail1_list; @@ -483,7 +486,7 @@ struct gfs2_trans { }; struct gfs2_journal_extent { - struct list_head extent_list; + struct list_head list; unsigned int lblock; /* First logical block */ u64 dblock; /* First disk block */ @@ -493,6 +496,7 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; + unsigned int nr_extents; struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; @@ -500,6 +504,15 @@ struct gfs2_jdesc { unsigned int jd_jid; unsigned int jd_blocks; int jd_recover_error; + /* Replay stuff */ + + unsigned int jd_found_blocks; + unsigned int jd_found_revokes; + unsigned int jd_replayed_blocks; + + struct list_head jd_revoke_list; + unsigned int jd_replay_tail; + }; struct gfs2_statfs_change_host { @@ -746,19 +759,12 @@ struct gfs2_sbd { struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - unsigned int sd_log_commited_buf; - unsigned int sd_log_commited_databuf; int sd_log_commited_revoke; atomic_t sd_log_pinned; - unsigned int sd_log_num_buf; unsigned int sd_log_num_revoke; - unsigned int sd_log_num_rg; - unsigned int sd_log_num_databuf; - struct list_head sd_log_le_buf; struct list_head sd_log_le_revoke; - struct list_head sd_log_le_databuf; struct list_head sd_log_le_ordered; spinlock_t sd_ordered_lock; @@ -786,15 +792,6 @@ struct gfs2_sbd { struct list_head sd_ail1_list; struct list_head sd_ail2_list; - /* Replay stuff */ - - struct list_head sd_revoke_list; - unsigned int sd_replay_tail; - - unsigned int sd_found_blocks; - unsigned int sd_found_revokes; - unsigned int sd_replayed_blocks; - /* For quiescing the filesystem */ struct gfs2_holder sd_freeze_gh; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5c524180c98e..28cc7bf6575a 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -376,12 +376,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip, inode->i_gid = current_fsgid(); } -static int alloc_dinode(struct gfs2_inode *ip, u32 flags) +static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, }; + struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - int dblocks = 1; error = gfs2_quota_lock_check(ip); if (error) @@ -391,11 +390,11 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags) if (error) goto out_quota; - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0); if (error) goto out_ipreserv; - error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); ip->i_no_formal_ino = ip->i_generation; ip->i_inode.i_ino = ip->i_no_addr; ip->i_goal = ip->i_no_addr; @@ -428,6 +427,33 @@ static void gfs2_init_dir(struct buffer_head *dibh, } /** + * gfs2_init_xattr - Initialise an xattr block for a new inode + * @ip: The inode in question + * + * This sets up an empty xattr block for a new inode, ready to + * take any ACLs, LSM xattrs, etc. + */ + +static void gfs2_init_xattr(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + struct gfs2_ea_header *ea; + + bh = gfs2_meta_new(ip->i_gl, ip->i_eattr); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(bh); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + + brelse(bh); +} + +/** * init_dinode - Fill in a new dinode structure * @dip: The directory this inode is being created in * @ip: The inode @@ -545,13 +571,6 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, return err; } -static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, - const struct qstr *qstr) -{ - return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr, - &gfs2_initxattrs, NULL); -} - /** * gfs2_create_inode - Create a new inode * @dir: The parent directory @@ -578,8 +597,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; struct dentry *d; - int error; + int error, free_vfs_inode = 0; u32 aflags = 0; + unsigned blocks = 1; struct gfs2_diradd da = { .bh = NULL, }; if (!name->len || name->len > GFS2_FNAMESIZE) @@ -676,10 +696,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, (dip->i_diskflags & GFS2_DIF_TOPDIR)) aflags |= GFS2_AF_ORLOV; - error = alloc_dinode(ip, aflags); + if (default_acl || acl) + blocks++; + + error = alloc_dinode(ip, aflags, &blocks); if (error) goto fail_free_inode; + gfs2_set_inode_blocks(inode, blocks); + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; @@ -689,10 +714,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); + error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto fail_gunlock2; + if (blocks > 1) { + ip->i_eattr = ip->i_no_addr + 1; + gfs2_init_xattr(ip); + } init_dinode(dip, ip, symname); gfs2_trans_end(sdp); @@ -722,7 +751,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = gfs2_security_init(dip, ip, name); + error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, + &gfs2_initxattrs, NULL); if (error) goto fail_gunlock3; @@ -758,15 +788,16 @@ fail_free_acls: if (acl) posix_acl_release(acl); fail_free_vfs_inode: - free_inode_nonrcu(inode); - inode = NULL; + free_vfs_inode = 1; fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { clear_nlink(inode); - mark_inode_dirty(inode); - set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); + if (!free_vfs_inode) + mark_inode_dirty(inode); + set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED, + &GFS2_I(inode)->i_flags); iput(inode); } fail: @@ -1263,6 +1294,10 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) } tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (!tmp) { + error = -ENOENT; + break; + } if (IS_ERR(tmp)) { error = PTR_ERR(tmp); break; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 2a6ba06bee6f..c1eb555dc588 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/dlm.h> #include <linux/slab.h> @@ -176,7 +178,7 @@ static void gdlm_bast(void *arg, int mode) gfs2_glock_cb(gl, LM_ST_SHARED); break; default: - printk(KERN_ERR "unknown bast mode %d", mode); + pr_err("unknown bast mode %d\n", mode); BUG(); } } @@ -195,7 +197,7 @@ static int make_mode(const unsigned int lmstate) case LM_ST_SHARED: return DLM_LOCK_PR; } - printk(KERN_ERR "unknown LM state %d", lmstate); + pr_err("unknown LM state %d\n", lmstate); BUG(); return -1; } @@ -308,7 +310,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); if (error) { - printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", + pr_err("gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, error); return; @@ -1102,7 +1104,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) } if (ls->ls_recover_submit[jid]) { - fs_info(sdp, "recover_slot jid %d gen %u prev %u", + fs_info(sdp, "recover_slot jid %d gen %u prev %u\n", jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); } ls->ls_recover_submit[jid] = ls->ls_recover_block; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9dcb9777a5f8..4a14d504ef83 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -18,6 +18,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/bio.h> +#include <linux/blkdev.h> #include <linux/writeback.h> #include <linux/list_sort.h> @@ -145,8 +146,10 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) { struct list_head *head = &sdp->sd_ail1_list; struct gfs2_trans *tr; + struct blk_plug plug; trace_gfs2_ail_flush(sdp, wbc, 1); + blk_start_plug(&plug); spin_lock(&sdp->sd_ail_lock); restart: list_for_each_entry_reverse(tr, head, tr_list) { @@ -156,6 +159,7 @@ restart: goto restart; } spin_unlock(&sdp->sd_ail_lock); + blk_finish_plug(&plug); trace_gfs2_ail_flush(sdp, wbc, 0); } @@ -410,24 +414,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer static unsigned int calc_reserved(struct gfs2_sbd *sdp) { unsigned int reserved = 0; - unsigned int mbuf_limit, metabufhdrs_needed; - unsigned int dbuf_limit, databufhdrs_needed; - unsigned int revokes = 0; + unsigned int mbuf; + unsigned int dbuf; + struct gfs2_trans *tr = sdp->sd_log_tr; - mbuf_limit = buf_limit(sdp); - metabufhdrs_needed = (sdp->sd_log_commited_buf + - (mbuf_limit - 1)) / mbuf_limit; - dbuf_limit = databuf_limit(sdp); - databufhdrs_needed = (sdp->sd_log_commited_databuf + - (dbuf_limit - 1)) / dbuf_limit; + if (tr) { + mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + reserved = mbuf + dbuf; + /* Account for header blocks */ + reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp)); + reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); + } if (sdp->sd_log_commited_revoke > 0) - revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, sizeof(u64)); - - reserved = sdp->sd_log_commited_buf + metabufhdrs_needed + - sdp->sd_log_commited_databuf + databufhdrs_needed + - revokes; /* One for the overall header */ if (reserved) reserved++; @@ -682,36 +684,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } trace_gfs2_log_flush(sdp, 1); + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; tr = sdp->sd_log_tr; if (tr) { sdp->sd_log_tr = NULL; INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); + tr->tr_first = sdp->sd_log_flush_head; } - if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) { - printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf, - sdp->sd_log_commited_buf); - gfs2_assert_withdraw(sdp, 0); - } - if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) { - printk(KERN_INFO "GFS2: log databuf %u %u\n", - sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf); - gfs2_assert_withdraw(sdp, 0); - } gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); - sdp->sd_log_flush_head = sdp->sd_log_head; - sdp->sd_log_flush_wrapped = 0; - if (tr) - tr->tr_first = sdp->sd_log_flush_head; - gfs2_ordered_write(sdp); - lops_before_commit(sdp); + lops_before_commit(sdp, tr); gfs2_log_flush_bio(sdp, WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { + log_flush_wait(sdp); log_write_header(sdp, 0); } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ @@ -723,8 +714,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_commited_buf = 0; - sdp->sd_log_commited_databuf = 0; sdp->sd_log_commited_revoke = 0; spin_lock(&sdp->sd_ail_lock); @@ -740,34 +729,54 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) kfree(tr); } +/** + * gfs2_merge_trans - Merge a new transaction into a cached transaction + * @old: Original transaction to be expanded + * @new: New transaction to be merged + */ + +static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) +{ + WARN_ON_ONCE(old->tr_attached != 1); + + old->tr_num_buf_new += new->tr_num_buf_new; + old->tr_num_databuf_new += new->tr_num_databuf_new; + old->tr_num_buf_rm += new->tr_num_buf_rm; + old->tr_num_databuf_rm += new->tr_num_databuf_rm; + old->tr_num_revoke += new->tr_num_revoke; + old->tr_num_revoke_rm += new->tr_num_revoke_rm; + + list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); + list_splice_tail_init(&new->tr_buf, &old->tr_buf); +} + static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { unsigned int reserved; unsigned int unused; + unsigned int maxres; gfs2_log_lock(sdp); - sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; - sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - - tr->tr_num_databuf_rm; - gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || - (((int)sdp->sd_log_commited_databuf) >= 0)); + if (sdp->sd_log_tr) { + gfs2_merge_trans(sdp->sd_log_tr, tr); + } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { + gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl); + sdp->sd_log_tr = tr; + tr->tr_attached = 1; + } + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; reserved = calc_reserved(sdp); - gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); - unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; + maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; + gfs2_assert_withdraw(sdp, maxres >= reserved); + unused = maxres - reserved; atomic_add(unused, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, unused); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); sdp->sd_log_blks_reserved = reserved; - if (sdp->sd_log_tr == NULL && - (tr->tr_num_buf_new || tr->tr_num_databuf_new)) { - gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl); - sdp->sd_log_tr = tr; - tr->tr_attached = 1; - } gfs2_log_unlock(sdp); } @@ -807,10 +816,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) down_write(&sdp->sd_log_flush_lock); gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); - gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); sdp->sd_log_flush_head = sdp->sd_log_head; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 76693793cedd..a294d8d8bcd4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -146,8 +146,8 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) struct gfs2_journal_extent *je; u64 block; - list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { - if (lbn >= je->lblock && lbn < je->lblock + je->blocks) { + list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) { + if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) { block = je->dblock + lbn - je->lblock; gfs2_log_incr_head(sdp); return block; @@ -491,44 +491,40 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, gfs2_log_unlock(sdp); } -static void buf_lo_before_commit(struct gfs2_sbd *sdp) +static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ - - gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf, - &sdp->sd_log_le_buf, 0); + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0); } static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &sdp->sd_log_le_buf; + struct list_head *head; struct gfs2_bufdata *bd; - if (tr == NULL) { - gfs2_assert(sdp, list_empty(head)); + if (tr == NULL) return; - } + head = &tr->tr_buf; while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); - sdp->sd_log_num_buf--; - gfs2_unpin(sdp, bd->bd_bh, tr); } - gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); } static void buf_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - if (pass != 0) return; - sdp->sd_found_blocks = 0; - sdp->sd_replayed_blocks = 0; + jd->jd_found_blocks = 0; + jd->jd_replayed_blocks = 0; } static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -551,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { blkno = be64_to_cpu(*ptr++); - sdp->sd_found_blocks++; + jd->jd_found_blocks++; - if (gfs2_revoke_check(sdp, blkno, start)) + if (gfs2_revoke_check(jd, blkno, start)) continue; error = gfs2_replay_read_block(jd, start, &bh_log); @@ -574,7 +570,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, if (error) break; - sdp->sd_replayed_blocks++; + jd->jd_replayed_blocks++; } return error; @@ -617,10 +613,10 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_meta_sync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", - jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); } -static void revoke_lo_before_commit(struct gfs2_sbd *sdp) +static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { struct gfs2_meta_header *mh; unsigned int offset; @@ -679,13 +675,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) static void revoke_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - if (pass != 0) return; - sdp->sd_found_revokes = 0; - sdp->sd_replay_tail = head->lh_tail; + jd->jd_found_revokes = 0; + jd->jd_replay_tail = head->lh_tail; } static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -717,13 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); - error = gfs2_revoke_add(sdp, blkno, start); + error = gfs2_revoke_add(jd, blkno, start); if (error < 0) { brelse(bh); return error; } else if (error) - sdp->sd_found_revokes++; + jd->jd_found_revokes++; if (!--revokes) break; @@ -743,16 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (error) { - gfs2_revoke_clean(sdp); + gfs2_revoke_clean(jd); return; } if (pass != 1) return; fs_info(sdp, "jid=%u: Found %u revoke tags\n", - jd->jd_jid, sdp->sd_found_revokes); + jd->jd_jid, jd->jd_found_revokes); - gfs2_revoke_clean(sdp); + gfs2_revoke_clean(jd); } /** @@ -760,12 +754,14 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) * */ -static void databuf_lo_before_commit(struct gfs2_sbd *sdp) +static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - unsigned int limit = buf_limit(sdp) / 2; - - gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf, - &sdp->sd_log_le_databuf, 1); + unsigned int limit = databuf_limit(sdp); + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1); } static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, @@ -789,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, blkno = be64_to_cpu(*ptr++); esc = be64_to_cpu(*ptr++); - sdp->sd_found_blocks++; + jd->jd_found_blocks++; - if (gfs2_revoke_check(sdp, blkno, start)) + if (gfs2_revoke_check(jd, blkno, start)) continue; error = gfs2_replay_read_block(jd, start, &bh_log); @@ -811,7 +807,7 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, brelse(bh_log); brelse(bh_ip); - sdp->sd_replayed_blocks++; + jd->jd_replayed_blocks++; } return error; @@ -835,26 +831,23 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) gfs2_meta_sync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", - jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); } static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct list_head *head = &sdp->sd_log_le_databuf; + struct list_head *head; struct gfs2_bufdata *bd; - if (tr == NULL) { - gfs2_assert(sdp, list_empty(head)); + if (tr == NULL) return; - } + head = &tr->tr_databuf; while (!list_empty(head)) { bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); - sdp->sd_log_num_databuf--; gfs2_unpin(sdp, bd->bd_bh, tr); } - gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9ca2e6438419..a65a7ba32ffd 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -46,12 +46,13 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) return limit; } -static inline void lops_before_commit(struct gfs2_sbd *sdp) +static inline void lops_before_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) { int x; for (x = 0; gfs2_log_ops[x]; x++) if (gfs2_log_ops[x]->lo_before_commit) - gfs2_log_ops[x]->lo_before_commit(sdp); + gfs2_log_ops[x]->lo_before_commit(sdp, tr); } static inline void lops_after_commit(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c272e73063de..82b6ac829656 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -165,7 +167,7 @@ static int __init init_gfs2_fs(void) gfs2_register_debugfs(); - printk("GFS2 installed\n"); + pr_info("GFS2 installed\n"); return 0; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index c7f24690ed05..2cf09b63a6b4 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = { .releasepage = gfs2_releasepage, }; +const struct address_space_operations gfs2_rgrp_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + /** * gfs2_getbuf - Get a buffer with a given address space * @gl: the glock @@ -267,15 +272,10 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_list); - if (meta) { - gfs2_assert_warn(sdp, sdp->sd_log_num_buf); - sdp->sd_log_num_buf--; + if (meta) tr->tr_num_buf_rm++; - } else { - gfs2_assert_warn(sdp, sdp->sd_log_num_databuf); - sdp->sd_log_num_databuf--; + else tr->tr_num_databuf_rm++; - } tr->tr_touched = 1; was_pinned = 1; brelse(bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 4823b934208a..ac5d8027d335 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, } extern const struct address_space_operations gfs2_meta_aops; +extern const struct address_space_operations gfs2_rgrp_aops; static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) { struct inode *inode = mapping->host; if (mapping->a_ops == &gfs2_meta_aops) return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; + else if (mapping->a_ops == &gfs2_rgrp_aops) + return container_of(mapping, struct gfs2_sbd, sd_aspace); else return inode->i_sb->s_fs_info; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c6872d09561a..22f954051bb8 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -104,7 +106,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping = &sdp->sd_aspace; address_space_init_once(mapping); - mapping->a_ops = &gfs2_meta_aops; + mapping->a_ops = &gfs2_rgrp_aops; mapping->host = sb->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); @@ -114,9 +116,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); - INIT_LIST_HEAD(&sdp->sd_log_le_buf); INIT_LIST_HEAD(&sdp->sd_log_le_revoke); - INIT_LIST_HEAD(&sdp->sd_log_le_databuf); INIT_LIST_HEAD(&sdp->sd_log_le_ordered); spin_lock_init(&sdp->sd_ordered_lock); @@ -130,8 +130,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) atomic_set(&sdp->sd_log_in_flight, 0); init_waitqueue_head(&sdp->sd_log_flush_wait); - INIT_LIST_HEAD(&sdp->sd_revoke_list); - return sdp; } @@ -154,7 +152,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) if (sb->sb_magic != GFS2_MAGIC || sb->sb_type != GFS2_METATYPE_SB) { if (!silent) - printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + pr_warn("not a GFS2 filesystem\n"); return -EINVAL; } @@ -176,7 +174,7 @@ static void end_bio_io_page(struct bio *bio, int error) if (!error) SetPageUptodate(page); else - printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + pr_warn("error %d reading superblock\n", error); unlock_page(page); } @@ -519,67 +517,6 @@ out: return ret; } -/** - * map_journal_extents - create a reusable "extent" mapping from all logical - * blocks to all physical blocks for the given journal. This will save - * us time when writing journal blocks. Most journals will have only one - * extent that maps all their logical blocks. That's because gfs2.mkfs - * arranges the journal blocks sequentially to maximize performance. - * So the extent would map the first block for the entire file length. - * However, gfs2_jadd can happen while file activity is happening, so - * those journals may not be sequential. Less likely is the case where - * the users created their own journals by mounting the metafs and - * laying it out. But it's still possible. These journals might have - * several extents. - * - * TODO: This should be done in bigger chunks rather than one block at a time, - * but since it's only done at mount time, I'm not worried about the - * time it takes. - */ -static int map_journal_extents(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd = sdp->sd_jdesc; - unsigned int lb; - u64 db, prev_db; /* logical block, disk block, prev disk block */ - struct gfs2_inode *ip = GFS2_I(jd->jd_inode); - struct gfs2_journal_extent *jext = NULL; - struct buffer_head bh; - int rc = 0; - - prev_db = 0; - - for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { - bh.b_state = 0; - bh.b_blocknr = 0; - bh.b_size = 1 << ip->i_inode.i_blkbits; - rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0); - db = bh.b_blocknr; - if (rc || !db) { - printk(KERN_INFO "GFS2 journal mapping error %d: lb=" - "%u db=%llu\n", rc, lb, (unsigned long long)db); - break; - } - if (!prev_db || db != prev_db + 1) { - jext = kzalloc(sizeof(struct gfs2_journal_extent), - GFP_KERNEL); - if (!jext) { - printk(KERN_INFO "GFS2 error: out of memory " - "mapping journal extents.\n"); - rc = -ENOMEM; - break; - } - jext->dblock = db; - jext->lblock = lb; - jext->blocks = 1; - list_add_tail(&jext->extent_list, &jd->extent_list); - } else { - jext->blocks++; - } - prev_db = db; - } - return rc; -} - static void gfs2_others_may_mount(struct gfs2_sbd *sdp) { char *message = "FIRSTMOUNT=Done"; @@ -638,6 +575,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); + INIT_LIST_HEAD(&jd->jd_revoke_list); + INIT_WORK(&jd->jd_work, gfs2_recover_func); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { @@ -781,7 +720,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ - map_journal_extents(sdp); + gfs2_map_journal_extents(sdp, sdp->sd_jdesc); } trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); @@ -1008,7 +947,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) lm = &gfs2_dlm_ops; #endif } else { - printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); + pr_info("can't find protocol %s\n", proto); return -ENOENT; } @@ -1115,7 +1054,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp = init_sbd(sb); if (!sdp) { - printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); + pr_warn("can't alloc struct gfs2_sbd\n"); return -ENOMEM; } sdp->sd_args = *args; @@ -1363,7 +1302,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, error = gfs2_mount_args(&args, data); if (error) { - printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); + pr_warn("can't parse mount arguments\n"); goto error_super; } @@ -1413,15 +1352,15 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, error = kern_path(dev_name, LOOKUP_FOLLOW, &path); if (error) { - printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", - dev_name, error); + pr_warn("path_lookup on %s returned error %d\n", + dev_name, error); return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { - printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); + pr_warn("gfs2 mount does not exist\n"); return ERR_CAST(s); } if ((flags ^ s->s_flags) & MS_RDONLY) { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8bec0e3192dd..c4effff7cf55 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -36,6 +36,8 @@ * the quota file, so it is not being constantly read. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> @@ -330,6 +332,7 @@ static int slot_get(struct gfs2_quota_data *qd) if (bit < sdp->sd_quota_slots) { set_bit(bit, sdp->sd_quota_bitmap); qd->qd_slot = bit; + error = 0; out: qd->qd_slot_count++; } @@ -1081,10 +1084,10 @@ static int print_message(struct gfs2_quota_data *qd, char *type) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", - sdp->sd_fsname, type, - (qd->qd_id.type == USRQUOTA) ? "user" : "group", - from_kqid(&init_user_ns, qd->qd_id)); + fs_info(sdp, "quota %s for %s %u\n", + type, + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); return 0; } @@ -1242,14 +1245,13 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); bm_size *= sizeof(unsigned long); error = -ENOMEM; - sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); + sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) - sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | + __GFP_ZERO, PAGE_KERNEL); if (!sdp->sd_quota_bitmap) return error; - memset(sdp->sd_quota_bitmap, 0, bm_size); - for (x = 0; x < blocks; x++) { struct buffer_head *bh; const struct gfs2_quota_change *qc; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 963b2d75200c..7ad4094d68c0 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -52,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, return error; } -int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { - struct list_head *head = &sdp->sd_revoke_list; + struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; int found = 0; @@ -81,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) return 1; } -int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct gfs2_revoke_replay *rr; int wrap, a, b, revoke; int found = 0; - list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { + list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) { if (rr->rr_blkno == blkno) { found = 1; break; @@ -97,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) if (!found) return 0; - wrap = (rr->rr_where < sdp->sd_replay_tail); - a = (sdp->sd_replay_tail < where); + wrap = (rr->rr_where < jd->jd_replay_tail); + a = (jd->jd_replay_tail < where); b = (where < rr->rr_where); revoke = (wrap) ? (a || b) : (a && b); return revoke; } -void gfs2_revoke_clean(struct gfs2_sbd *sdp) +void gfs2_revoke_clean(struct gfs2_jdesc *jd) { - struct list_head *head = &sdp->sd_revoke_list; + struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; while (!list_empty(head)) { diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 2226136c7647..6142836cce96 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -23,9 +23,9 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh); -extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); +extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index a1da21349235..281a7716e3f3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -99,12 +101,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; if (unlikely(!valid_change[new_state * 4 + cur_state])) { - printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " - "new_state=%d\n", rbm->offset, cur_state, new_state); - printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", - (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); - printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", - bi->bi_offset, bi->bi_len); + pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n", + rbm->offset, cur_state, new_state); + pr_warn("rgrp=0x%llx bi_start=0x%x\n", + (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); + pr_warn("bi_offset=0x%x bi_len=0x%x\n", + bi->bi_offset, bi->bi_len); dump_stack(); gfs2_consist_rgrpd(rbm->rgd); return; @@ -736,11 +738,11 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) { - printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); - printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); - printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); - printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); - printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); + pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); + pr_info("ri_length = %u\n", rgd->rd_length); + pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); + pr_info("ri_data = %u\n", rgd->rd_data); + pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes); } /** @@ -1102,7 +1104,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) * Returns: errno */ -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct gfs2_glock *gl = rgd->rd_gl; @@ -1169,7 +1171,7 @@ fail: return error; } -int update_rgrp_lvb(struct gfs2_rgrpd *rgd) +static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) { u32 rl_flags; @@ -2278,7 +2280,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, } } if (rbm.rgd->rd_free < *nblocks) { - printk(KERN_WARNING "nblocks=%u\n", *nblocks); + pr_warn("nblocks=%u\n", *nblocks); goto rgrp_error; } @@ -2296,7 +2298,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) - gfs2_trans_add_unrevoke(sdp, block, 1); + gfs2_trans_add_unrevoke(sdp, block, *nblocks); gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 60f60f6181f3..de8afad89e51 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bio.h> #include <linux/sched.h> #include <linux/slab.h> @@ -175,8 +177,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) break; case Opt_debug: if (args->ar_errors == GFS2_ERRORS_PANIC) { - printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " - "are mutually exclusive.\n"); + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); return -EINVAL; } args->ar_debug = 1; @@ -228,21 +229,21 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) case Opt_commit: rv = match_int(&tmp[0], &args->ar_commit); if (rv || args->ar_commit <= 0) { - printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); + pr_warn("commit mount option requires a positive numeric argument\n"); return rv ? rv : -EINVAL; } break; case Opt_statfs_quantum: rv = match_int(&tmp[0], &args->ar_statfs_quantum); if (rv || args->ar_statfs_quantum < 0) { - printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); + pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); return rv ? rv : -EINVAL; } break; case Opt_quota_quantum: rv = match_int(&tmp[0], &args->ar_quota_quantum); if (rv || args->ar_quota_quantum <= 0) { - printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); + pr_warn("quota_quantum mount option requires a positive numeric argument\n"); return rv ? rv : -EINVAL; } break; @@ -250,7 +251,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) rv = match_int(&tmp[0], &args->ar_statfs_percent); if (rv || args->ar_statfs_percent < 0 || args->ar_statfs_percent > 100) { - printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); + pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); return rv ? rv : -EINVAL; } break; @@ -259,8 +260,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) break; case Opt_err_panic: if (args->ar_debug) { - printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " - "are mutually exclusive.\n"); + pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); return -EINVAL; } args->ar_errors = GFS2_ERRORS_PANIC; @@ -279,7 +279,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) break; case Opt_error: default: - printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); + pr_warn("invalid mount option: %s\n", o); return -EINVAL; } } @@ -295,9 +295,8 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) void gfs2_jindex_free(struct gfs2_sbd *sdp) { - struct list_head list, *head; + struct list_head list; struct gfs2_jdesc *jd; - struct gfs2_journal_extent *jext; spin_lock(&sdp->sd_jindex_spin); list_add(&list, &sdp->sd_jindex_list); @@ -307,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp) while (!list_empty(&list)) { jd = list_entry(list.next, struct gfs2_jdesc, jd_list); - head = &jd->extent_list; - while (!list_empty(head)) { - jext = list_entry(head->next, - struct gfs2_journal_extent, - extent_list); - list_del(&jext->extent_list); - kfree(jext); - } + gfs2_free_journal_extents(jd); list_del(&jd->jd_list); iput(jd->jd_inode); kfree(jd); @@ -1175,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) struct gfs2_tune *gt = &sdp->sd_tune; int error; + sync_filesystem(sb); + spin_lock(>->gt_spin); args.ar_commit = gt->gt_logd_secs; args.ar_quota_quantum = gt->gt_quota_quantum; @@ -1256,7 +1250,7 @@ static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - if (inode->i_nlink) { + if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); @@ -1471,6 +1465,11 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_holder gh; int error; + if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { + clear_inode(inode); + return; + } + if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) goto out; @@ -1558,7 +1557,7 @@ out_unlock: fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: /* Case 3 starts here */ - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); gfs2_rs_delete(ip, NULL); gfs2_ordered_del_inode(ip); clear_inode(inode); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d09f6edda0ff..de25d5577e5d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -138,9 +140,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) if (simple_strtol(buf, NULL, 0) != 1) return -EINVAL; - gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: withdrawing from cluster at user's request\n", - sdp->sd_fsname); + gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n"); + return len; } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 2b20d7046bf3..bead90d27bad 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -51,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (revokes) tr->tr_reserved += gfs2_struct2blk(sdp, revokes, sizeof(u64)); + INIT_LIST_HEAD(&tr->tr_databuf); + INIT_LIST_HEAD(&tr->tr_buf); + sb_start_intwrite(sdp->sd_vfs); gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); @@ -96,14 +101,13 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) static void gfs2_print_trans(const struct gfs2_trans *tr) { - printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", - (void *)tr->tr_ip); - printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", - tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); - printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", - tr->tr_num_buf_new, tr->tr_num_buf_rm, - tr->tr_num_databuf_new, tr->tr_num_databuf_rm, - tr->tr_num_revoke, tr->tr_num_revoke_rm); + pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip); + pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n", + tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); + pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + tr->tr_num_buf_new, tr->tr_num_buf_rm, + tr->tr_num_databuf_new, tr->tr_num_databuf_rm, + tr->tr_num_revoke, tr->tr_num_revoke_rm); } void gfs2_trans_end(struct gfs2_sbd *sdp) @@ -210,8 +214,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); gfs2_pin(sdp, bd->bd_bh); tr->tr_num_databuf_new++; - sdp->sd_log_num_databuf++; - list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); + list_add_tail(&bd->bd_list, &tr->tr_databuf); } gfs2_log_unlock(sdp); unlock_buffer(bh); @@ -230,16 +233,14 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { - printk(KERN_ERR - "Attempting to add uninitialised block to journal (inplace block=%lld)\n", + pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n", (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - sdp->sd_log_num_buf++; - list_add(&bd->bd_list, &sdp->sd_log_le_buf); + list_add(&bd->bd_list, &tr->tr_buf); tr->tr_num_buf_new++; } diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f7109f689e61..86d2035ac669 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -7,6 +7,8 @@ * of the GNU General Public License version 2. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> @@ -30,22 +32,27 @@ mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { - printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", - sdp->sd_fsname); + fs_emerg(sdp, "fatal assertion failed\n"); } -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; const struct lm_lockops *lm = ls->ls_ops; va_list args; + struct va_format vaf; if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return 0; va_start(args, fmt); - vprintk(fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + fs_err(sdp, "%pV", &vaf); + va_end(args); if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { @@ -66,7 +73,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) } if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) - panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); + panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname); return -1; } @@ -82,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + "fatal: assertion \"%s\" failed\n" + " function = %s, file = %s, line = %u\n", + assertion, function, file, line); dump_stack(); return (me) ? -1 : -2; } @@ -105,11 +111,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, return -2; if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) - printk(KERN_WARNING - "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n", + assertion, function, file, line); if (sdp->sd_args.ar_debug) BUG(); @@ -138,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n", + function, file, line); return rv; } @@ -157,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: inode = %llu %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, - (unsigned long long)ip->i_no_addr, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error\n" + " inode = %llu %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + function, file, line); return rv; } @@ -179,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, struct gfs2_sbd *sdp = rgd->rd_sbd; int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: filesystem consistency error\n" - "GFS2: fsid=%s: RG = %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)rgd->rd_addr, - sdp->sd_fsname, function, file, line); + "fatal: filesystem consistency error\n" + " RG = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)rgd->rd_addr, + function, file, line); return rv; } @@ -200,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: invalid metadata block\n" - "GFS2: fsid=%s: bh = %llu (%s)\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, - sdp->sd_fsname, function, file, line); + "fatal: invalid metadata block\n" + " bh = %llu (%s)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, + function, file, line); return (me) ? -1 : -2; } @@ -221,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, { int me; me = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: invalid metadata block\n" - "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, - sdp->sd_fsname, function, file, line); + "fatal: invalid metadata block\n" + " bh = %llu (type: exp=%u, found=%u)\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, type, t, + function, file, line); return (me) ? -1 : -2; } @@ -241,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: I/O error\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, function, file, line); + "fatal: I/O error\n" + " function = %s, file = %s, line = %u\n", + function, file, line); return rv; } @@ -259,12 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, { int rv; rv = gfs2_lm_withdraw(sdp, - "GFS2: fsid=%s: fatal: I/O error\n" - "GFS2: fsid=%s: block = %llu\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, - sdp->sd_fsname, (unsigned long long)bh->b_blocknr, - sdp->sd_fsname, function, file, line); + "fatal: I/O error\n" + " block = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, + function, file, line); return rv; } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index b7ffb09b99ea..cbdcbdf39614 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -10,22 +10,23 @@ #ifndef __UTIL_DOT_H__ #define __UTIL_DOT_H__ +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif + #include <linux/mempool.h> #include "incore.h" -#define fs_printk(level, fs, fmt, arg...) \ - printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) - -#define fs_info(fs, fmt, arg...) \ - fs_printk(KERN_INFO , fs , fmt , ## arg) - -#define fs_warn(fs, fmt, arg...) \ - fs_printk(KERN_WARNING , fs , fmt , ## arg) - -#define fs_err(fs, fmt, arg...) \ - fs_printk(KERN_ERR, fs , fmt , ## arg) - +#define fs_emerg(fs, fmt, ...) \ + pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_warn(fs, fmt, ...) \ + pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_err(fs, fmt, ...) \ + pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) +#define fs_info(fs, fmt, ...) \ + pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__) void gfs2_assert_i(struct gfs2_sbd *sdp); @@ -85,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; u32 magic = be32_to_cpu(mh->mh_magic); if (unlikely(magic != GFS2_MAGIC)) { - printk(KERN_ERR "GFS2: Magic number missing at %llu\n", + pr_err("Magic number missing at %llu\n", (unsigned long long)bh->b_blocknr); return -EIO; } @@ -164,7 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) -int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); +__printf(2, 3) +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...); #endif /* __UTIL_DOT_H__ */ - diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 380ab31b5e0f..9e2fecd62f62 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -547,7 +547,7 @@ out: void hfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 2d2039e754cd..eee7206c38d1 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 0f47890299c4..caf89a7be0a1 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -11,7 +11,7 @@ static struct kmem_cache *hfsplus_attr_tree_cachep; -int hfsplus_create_attr_tree_cache(void) +int __init hfsplus_create_attr_tree_cache(void) { if (hfsplus_attr_tree_cachep) return -EEXIST; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fbb212fbb1ef..a7aafb35b624 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -227,10 +227,8 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, u32 ablock, dblock, mask; sector_t sector; int was_dirty = 0; - int shift; /* Convert inode block to disk allocation block */ - shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits; ablock = iblock >> sbi->fs_shift; if (iblock >= hip->fs_blocks) { @@ -498,11 +496,13 @@ int hfsplus_file_extend(struct inode *inode) goto insert_extent; } out: - mutex_unlock(&hip->extents_lock); if (!res) { hip->alloc_blocks += len; + mutex_unlock(&hip->extents_lock); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); + return 0; } + mutex_unlock(&hip->extents_lock); return res; insert_extent: @@ -556,11 +556,13 @@ void hfsplus_file_truncate(struct inode *inode) blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> HFSPLUS_SB(sb)->alloc_blksz_shift; + + mutex_lock(&hip->extents_lock); + alloc_cnt = hip->alloc_blocks; if (blk_cnt == alloc_cnt) - goto out; + goto out_unlock; - mutex_lock(&hip->extents_lock); res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); if (res) { mutex_unlock(&hip->extents_lock); @@ -592,10 +594,10 @@ void hfsplus_file_truncate(struct inode *inode) hfs_brec_remove(&fd); } hfs_find_exit(&fd); - mutex_unlock(&hip->extents_lock); hip->alloc_blocks = blk_cnt; -out: +out_unlock: + mutex_unlock(&hip->extents_lock); hip->phys_size = inode->i_size; hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 62d571eb69ba..83dc29286b10 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -367,7 +367,7 @@ typedef int (*search_strategy_t)(struct hfs_bnode *, */ /* attributes.c */ -int hfsplus_create_attr_tree_cache(void); +int __init hfsplus_create_attr_tree_cache(void); void hfsplus_destroy_attr_tree_cache(void); hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 80875aa640ef..a513d2d36be9 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -161,7 +161,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; @@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (!(*flags & MS_RDONLY)) { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fe649d325b1f..9c470fde9878 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -230,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) static void hostfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HOSTFS_I(inode)->fd != -1) { close_file(&HOSTFS_I(inode)->fd); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 9edeeb0ea97e..50a427313835 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -304,7 +304,7 @@ void hpfs_write_if_changed(struct inode *inode) void hpfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (!inode->i_nlink) { hpfs_lock(inode->i_sb); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 4534ff688b76..fe3463a43236 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); + sync_filesystem(s); + *flags |= MS_NOATIME; hpfs_lock(s); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d19b30ababf1..e19d4c0cacae 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -366,7 +366,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) static void hugetlbfs_evict_inode(struct inode *inode) { + struct resv_map *resv_map; + truncate_hugepages(inode, 0); + resv_map = (struct resv_map *)inode->i_mapping->private_data; + /* root inode doesn't have the resv_map, so we should check it */ + if (resv_map) + resv_map_release(&resv_map->refs); clear_inode(inode); } @@ -476,6 +482,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; + struct resv_map *resv_map; + + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; inode = new_inode(sb); if (inode) { @@ -487,7 +498,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - INIT_LIST_HEAD(&inode->i_mapping->private_list); + inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); /* * The policy is initialized here even if we are creating a @@ -517,7 +528,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } + } else + kref_put(&resv_map->refs, resv_map_release); + return inode; } @@ -1017,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; diff --git a/fs/inode.c b/fs/inode.c index 4bcdad3c9361..f96d2a6f88cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -503,6 +503,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + BUG_ON(inode->i_data.nrshadows); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -548,8 +549,7 @@ static void evict(struct inode *inode) if (op->evict_inode) { op->evict_inode(inode); } else { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISBLK(inode->i_mode) && inode->i_bdev) @@ -944,24 +944,22 @@ EXPORT_SYMBOL(unlock_new_inode); /** * lock_two_nondirectories - take two i_mutexes on non-directory objects + * + * Lock any non-NULL argument that is not a directory. + * Zero, one or two objects may be locked by this function. + * * @inode1: first inode to lock * @inode2: second inode to lock */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); - if (inode1 == inode2 || !inode2) { - mutex_lock(&inode1->i_mutex); - return; - } - WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); - if (inode1 < inode2) { + if (inode1 > inode2) + swap(inode1, inode2); + + if (inode1 && !S_ISDIR(inode1->i_mode)) mutex_lock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); - } else { - mutex_lock(&inode2->i_mutex); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_NONDIR2); - } } EXPORT_SYMBOL(lock_two_nondirectories); @@ -972,8 +970,9 @@ EXPORT_SYMBOL(lock_two_nondirectories); */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - if (inode2 && inode2 != inode1) + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_unlock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) mutex_unlock(&inode2->i_mutex); } EXPORT_SYMBOL(unlock_two_nondirectories); @@ -1899,3 +1898,34 @@ void inode_dio_done(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } EXPORT_SYMBOL(inode_dio_done); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today --- for example, + * __generic_file_aio_write() calls file_remove_suid() without holding + * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + unsigned int old_flags, new_flags; + + WARN_ON_ONCE(flags & ~mask); + do { + old_flags = ACCESS_ONCE(inode->i_flags); + new_flags = (old_flags & ~mask) | flags; + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, + new_flags) != old_flags)); +} +EXPORT_SYMBOL(inode_set_flags); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4a9e10ea13f2..4556ce1af5b0 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -93,7 +93,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), @@ -117,6 +117,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); if (!(*flags & MS_RDONLY)) return -EROFS; return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index cf2fc0594063..5f26139a165a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(journal, commit_transaction, &log_bufs, WRITE_SYNC); - blk_finish_plug(&plug); jbd_debug(3, "JBD2: commit phase 2b\n"); @@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) err = 0; bufs = 0; descriptor = NULL; - blk_start_plug(&plug); while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ @@ -1067,6 +1065,25 @@ restart_loop: goto restart_loop; } + /* Add the transaction to the checkpoint list + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + spin_unlock(&journal->j_list_lock); + /* Done with this transaction! */ jbd_debug(3, "JBD2: commit phase 7\n"); @@ -1085,24 +1102,7 @@ restart_loop: atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); - - /* - * Calculate overall stats - */ - spin_lock(&journal->j_history_lock); - journal->j_stats.ts_tid++; - if (commit_transaction->t_requested) - journal->j_stats.ts_requested++; - journal->j_stats.run.rs_wait += stats.run.rs_wait; - journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; - journal->j_stats.run.rs_running += stats.run.rs_running; - journal->j_stats.run.rs_locked += stats.run.rs_locked; - journal->j_stats.run.rs_flushing += stats.run.rs_flushing; - journal->j_stats.run.rs_logging += stats.run.rs_logging; - journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; - journal->j_stats.run.rs_blocks += stats.run.rs_blocks; - journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; - spin_unlock(&journal->j_history_lock); + stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); @@ -1122,24 +1122,6 @@ restart_loop: write_unlock(&journal->j_state_lock); - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } - spin_unlock(&journal->j_list_lock); - /* Drop all spin_locks because commit_callback may be block. - * __journal_remove_checkpoint() can not destroy transaction - * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); @@ -1150,7 +1132,7 @@ restart_loop: write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; - /* Recheck checkpoint lists after j_list_lock was dropped */ + /* Check if the transaction can be dropped now that we are finished */ if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); @@ -1159,4 +1141,21 @@ restart_loop: spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.ts_requested += stats.ts_requested; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5fa344afb49a..67b8e303946c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug); #endif /* Checksumming functions */ -int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) return sb->s_checksum == jbd2_superblock_csum(j, sb); } -void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; @@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal) journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { - wake_up(&journal->j_wait_commit); write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); write_lock(&journal->j_state_lock); } @@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); read_lock(&journal->j_state_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 60bb365f54a5..38cfcf5f6fce 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * reused here. */ jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_modified = 0; JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "set next transaction"); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; } spin_unlock(&journal->j_list_lock); @@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) journal->j_running_transaction)) { printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " - "journal->j_running_transaction (%p, %u)", + "journal->j_running_transaction (%p, %u)\n", journal->j_devname, (unsigned long long) bh->b_blocknr, jh->b_transaction, @@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - if (unlikely(jh->b_transaction != - journal->j_committing_transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_transaction (%llu, %p, %u) != " - "journal->j_committing_transaction (%p, %u)", + if (unlikely(((jh->b_transaction != + journal->j_committing_transaction)) || + (jh->b_next_transaction != transaction))) { + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " + "bad jh for block %llu: " + "transaction (%p, %u), " + "jh->b_transaction (%p, %u), " + "jh->b_next_transaction (%p, %u), jlist %u\n", journal->j_devname, (unsigned long long) bh->b_blocknr, + transaction, transaction->t_tid, jh->b_transaction, - jh->b_transaction ? jh->b_transaction->t_tid : 0, - journal->j_committing_transaction, - journal->j_committing_transaction ? - journal->j_committing_transaction->t_tid : 0); - ret = -EINVAL; - } - if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_next_transaction (%llu, %p, %u) != " - "transaction (%p, %u)", - journal->j_devname, - (unsigned long long) bh->b_blocknr, + jh->b_transaction ? + jh->b_transaction->t_tid : 0, jh->b_next_transaction, jh->b_next_transaction ? jh->b_next_transaction->t_tid : 0, - transaction, transaction->t_tid); + jh->b_jlist); + WARN_ON(1); ret = -EINVAL; } /* And this case is illegal: we can't reuse another @@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); if (!buffer_jbd(bh)) goto not_jbd; @@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) * we know to remove the checkpoint after we commit. */ + spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction) { __jbd2_journal_temp_unlink_buffer(jh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); @@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) goto drop; } } + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); @@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = NULL; + spin_unlock(&journal->j_list_lock); /* * only drop a reference if this transaction modified @@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) } not_jbd: - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); __brelse(bh); drop: @@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != NULL) + if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL) { /* written-back checkpointed metadata buffer */ JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c index 16a5047903a6..406d9cc84ba8 100644 --- a/fs/jffs2/compr_rtime.c +++ b/fs/jffs2/compr_rtime.c @@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in, unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) { - short positions[256]; + unsigned short positions[256]; int outpos = 0; int pos=0; @@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in, unsigned char *cpage_out, uint32_t srclen, uint32_t destlen) { - short positions[256]; + unsigned short positions[256]; int outpos = 0; int pos=0; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index a69e426435dd..601afd1afddf 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -242,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode) jffs2_dbg(1, "%s(): ino #%lu mode %o\n", __func__, inode->i_ino, inode->i_mode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); jffs2_do_clear_inode(c, f); } @@ -457,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r The umask is only applied if there's no default ACL */ ret = jffs2_init_acl_pre(dir_i, inode, &mode); if (ret) { - make_bad_inode(inode); - iput(inode); - return ERR_PTR(ret); + mutex_unlock(&f->sem); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); } ret = jffs2_do_new_inode (c, f, mode, ri); if (ret) { + mutex_unlock(&f->sem); make_bad_inode(inode); iput(inode); return ERR_PTR(ret); @@ -479,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_size = 0; if (insert_inode_locked(inode) < 0) { + mutex_unlock(&f->sem); make_bad_inode(inode); iput(inode); return ERR_PTR(-EINVAL); @@ -687,7 +690,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, struct inode *inode = OFNI_EDONI_2SFFJ(f); struct page *pg; - pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, + pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, (void *)jffs2_do_readpage_unlock, inode); if (IS_ERR(pg)) return (void *)pg; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index e4619b00f7c5..fa35ff79ab35 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info uint32_t version; uint32_t data_crc; uint32_t partial_crc; - uint16_t csize; + uint32_t csize; uint16_t overlapped; }; diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 03310721712f..b6bd4affd9ad 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, spin_unlock(&c->erase_completion_lock); schedule(); + remove_wait_queue(&c->erase_wait, &wait); } else spin_unlock(&c->erase_completion_lock); } else if (ret) @@ -211,20 +212,25 @@ out: int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *len, uint32_t sumsize) { - int ret = -EAGAIN; + int ret; minsize = PAD(minsize); jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); - spin_lock(&c->erase_completion_lock); - while(ret == -EAGAIN) { + while (true) { + spin_lock(&c->erase_completion_lock); ret = jffs2_do_reserve_space(c, minsize, len, sumsize); if (ret) { jffs2_dbg(1, "%s(): looping, ret is %d\n", __func__, ret); } + spin_unlock(&c->erase_completion_lock); + + if (ret == -EAGAIN) + cond_resched(); + else + break; } - spin_unlock(&c->erase_completion_lock); if (!ret) ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0defb1cc2a35..0918f0e2e266 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); int err; + sync_filesystem(sb); err = jffs2_parse_options(c, data); if (err) return -EINVAL; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index f4aab719add5..6f8fe72c2a7a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); @@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode) dquot_free_inode(inode); } } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } clear_inode(inode); dquot_drop(inode); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index e2b7483444fd..97f7fda51890 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) int flag = JFS_SBI(sb)->flag; int ret; + sync_filesystem(sb); if (!parse_options(data, sb, &newLVSize, &flag)) { return -EINVAL; } diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig new file mode 100644 index 000000000000..397b5f7a7a16 --- /dev/null +++ b/fs/kernfs/Kconfig @@ -0,0 +1,7 @@ +# +# KERNFS should be selected by its users +# + +config KERNFS + bool + default n diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index bd6e18be6e1a..ac127cd008bf 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -8,6 +8,7 @@ * This file is released under the GPLv2. */ +#include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> @@ -18,9 +19,162 @@ #include "kernfs-internal.h" DEFINE_MUTEX(kernfs_mutex); +static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) +static bool kernfs_active(struct kernfs_node *kn) +{ + lockdep_assert_held(&kernfs_mutex); + return atomic_read(&kn->active) >= 0; +} + +static bool kernfs_lockdep(struct kernfs_node *kn) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + return kn->flags & KERNFS_LOCKDEP; +#else + return false; +#endif +} + +static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) +{ + return strlcpy(buf, kn->parent ? kn->name : "/", buflen); +} + +static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf, + size_t buflen) +{ + char *p = buf + buflen; + int len; + + *--p = '\0'; + + do { + len = strlen(kn->name); + if (p - buf < len + 1) { + buf[0] = '\0'; + p = NULL; + break; + } + p -= len; + memcpy(p, kn->name, len); + *--p = '/'; + kn = kn->parent; + } while (kn && kn->parent); + + return p; +} + +/** + * kernfs_name - obtain the name of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Copies the name of @kn into @buf of @buflen bytes. The behavior is + * similar to strlcpy(). It returns the length of @kn's name and if @buf + * isn't long enough, it's filled upto @buflen-1 and nul terminated. + * + * This function can be called from any context. + */ +int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + ret = kernfs_name_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return ret; +} + +/** + * kernfs_path - build full path of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Builds and returns the full path of @kn in @buf of @buflen bytes. The + * path is built from the end of @buf so the returned pointer usually + * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated + * and %NULL is returned. + */ +char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + p = kernfs_path_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return p; +} +EXPORT_SYMBOL_GPL(kernfs_path); + +/** + * pr_cont_kernfs_name - pr_cont name of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_name(struct kernfs_node *kn) +{ + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + pr_cont("%s", kernfs_pr_cont_buf); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * pr_cont_kernfs_path - pr_cont path of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_path(struct kernfs_node *kn) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + p = kernfs_path_locked(kn, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); + if (p) + pr_cont("%s", p); + else + pr_cont("<name too long>"); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * kernfs_get_parent - determine the parent node and pin it + * @kn: kernfs_node of interest + * + * Determines @kn's parent, pins and returns it. This function can be + * called from any context. + */ +struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + parent = kn->parent; + kernfs_get(parent); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return parent; +} + /** * kernfs_name_hash * @name: Null terminated string to hash @@ -37,7 +191,7 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns) hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ - if (hash < 1) + if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; @@ -78,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn) struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; - if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs++; - while (*node) { struct kernfs_node *pos; int result; @@ -95,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn) else return -EEXIST; } + /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); + + /* successfully added, account subdir number */ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + return 0; } @@ -105,18 +262,24 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * - * Unlink @kn from its sibling rbtree which starts from - * kn->parent->dir.children. + * Try to unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. Returns %true if @kn was actually + * removed, %false if @kn wasn't on the rbtree. * * Locking: * mutex_lock(kernfs_mutex) */ -static void kernfs_unlink_sibling(struct kernfs_node *kn) +static bool kernfs_unlink_sibling(struct kernfs_node *kn) { + if (RB_EMPTY_NODE(&kn->rb)) + return false; + if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; rb_erase(&kn->rb, &kn->parent->dir.children); + RB_CLEAR_NODE(&kn->rb); + return true; } /** @@ -137,7 +300,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) if (!atomic_inc_unless_negative(&kn->active)) return NULL; - if (kn->flags & KERNFS_LOCKDEP) + if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } @@ -151,59 +314,57 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) */ void kernfs_put_active(struct kernfs_node *kn) { + struct kernfs_root *root = kernfs_root(kn); int v; if (unlikely(!kn)) return; - if (kn->flags & KERNFS_LOCKDEP) + if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, 1, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; - /* - * atomic_dec_return() is a mb(), we'll always see the updated - * kn->u.completion. - */ - complete(kn->u.completion); + wake_up_all(&root->deactivate_waitq); } /** - * kernfs_deactivate - deactivate kernfs_node - * @kn: kernfs_node to deactivate + * kernfs_drain - drain kernfs_node + * @kn: kernfs_node to drain * - * Deny new active references and drain existing ones. + * Drain existing usages and nuke all existing mmaps of @kn. Mutiple + * removers may invoke this function concurrently on @kn and all will + * return after draining is complete. */ -static void kernfs_deactivate(struct kernfs_node *kn) +static void kernfs_drain(struct kernfs_node *kn) + __releases(&kernfs_mutex) __acquires(&kernfs_mutex) { - DECLARE_COMPLETION_ONSTACK(wait); - int v; - - BUG_ON(!(kn->flags & KERNFS_REMOVED)); + struct kernfs_root *root = kernfs_root(kn); - if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) - return; + lockdep_assert_held(&kernfs_mutex); + WARN_ON_ONCE(kernfs_active(kn)); - kn->u.completion = (void *)&wait; + mutex_unlock(&kernfs_mutex); - if (kn->flags & KERNFS_LOCKDEP) + if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated kn->u.completion. - */ - v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); - - if (v != KN_DEACTIVATED_BIAS) { - if (kn->flags & KERNFS_LOCKDEP) + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); - wait_for_completion(&wait); } - if (kn->flags & KERNFS_LOCKDEP) { + /* but everyone should wait for draining */ + wait_event(root->deactivate_waitq, + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); + + if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, 1, _RET_IP_); } + + kernfs_unmap_bin_file(kn); + + mutex_lock(&kernfs_mutex); } /** @@ -234,13 +395,15 @@ void kernfs_put(struct kernfs_node *kn) return; root = kernfs_root(kn); repeat: - /* Moving/renaming is always done while holding reference. + /* + * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; - WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", - parent ? parent->name : "", kn->name); + WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, + "kernfs_put: %s/%s: released with incorrect active_ref %d\n", + parent ? parent->name : "", kn->name, atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); @@ -282,8 +445,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kn = dentry->d_fsdata; mutex_lock(&kernfs_mutex); - /* The kernfs node has been deleted */ - if (kn->flags & KERNFS_REMOVED) + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) goto out_bad; /* The kernfs node has been moved? */ @@ -328,6 +491,24 @@ const struct dentry_operations kernfs_dops = { .d_release = kernfs_dop_release, }; +/** + * kernfs_node_from_dentry - determine kernfs_node associated with a dentry + * @dentry: the dentry in question + * + * Return the kernfs_node associated with @dentry. If @dentry is not a + * kernfs one, %NULL is returned. + * + * While the returned kernfs_node will stay accessible as long as @dentry + * is accessible, the returned node can be in any state and the caller is + * fully responsible for determining what's accessible. + */ +struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) +{ + if (dentry->d_sb->s_op == &kernfs_sops) + return dentry->d_fsdata; + return NULL; +} + static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, unsigned flags) @@ -352,11 +533,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->ino = ret; atomic_set(&kn->count, 1); - atomic_set(&kn->active, 0); + atomic_set(&kn->active, KN_DEACTIVATED_BIAS); + RB_CLEAR_NODE(&kn->rb); kn->name = name; kn->mode = mode; - kn->flags = flags | KERNFS_REMOVED; + kn->flags = flags; return kn; @@ -382,69 +564,44 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, } /** - * kernfs_addrm_start - prepare for kernfs_node add/remove - * @acxt: pointer to kernfs_addrm_cxt to be used - * - * This function is called when the caller is about to add or remove - * kernfs_node. This function acquires kernfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. - * - * LOCKING: - * Kernel thread context (may sleep). kernfs_mutex is locked on - * return. - */ -void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) - __acquires(kernfs_mutex) -{ - memset(acxt, 0, sizeof(*acxt)); - - mutex_lock(&kernfs_mutex); -} - -/** * kernfs_add_one - add kernfs_node to parent without warning - * @acxt: addrm context to use * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * - * This function should be called between calls to - * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed - * the same @acxt as passed to kernfs_addrm_start(). - * - * LOCKING: - * Determined by kernfs_addrm_start(). - * * RETURNS: * 0 on success, -EEXIST if entry with the given name already * exists. */ -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) +int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; - bool has_ns = kernfs_ns_enabled(parent); struct kernfs_iattrs *ps_iattr; + bool has_ns; int ret; - if (has_ns != (bool)kn->ns) { - WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, kn->name); - return -EINVAL; - } + mutex_lock(&kernfs_mutex); + + ret = -EINVAL; + has_ns = kernfs_ns_enabled(parent); + if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name)) + goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) - return -EINVAL; + goto out_unlock; - if (parent->flags & KERNFS_REMOVED) - return -ENOENT; + ret = -ENOENT; + if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); ret = kernfs_link_sibling(kn); if (ret) - return ret; + goto out_unlock; /* Update timestamps on the parent */ ps_iattr = parent->iattr; @@ -453,82 +610,22 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } - /* Mark the entry added into directory tree */ - kn->flags &= ~KERNFS_REMOVED; - - return 0; -} - -/** - * kernfs_remove_one - remove kernfs_node from parent - * @acxt: addrm context to use - * @kn: kernfs_node to be removed - * - * Mark @kn removed and drop nlink of parent inode if @kn is a - * directory. @kn is unlinked from the children list. - * - * This function should be called between calls to - * kernfs_addrm_start() and kernfs_addrm_finish() and should be - * passed the same @acxt as passed to kernfs_addrm_start(). - * - * LOCKING: - * Determined by kernfs_addrm_start(). - */ -static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, - struct kernfs_node *kn) -{ - struct kernfs_iattrs *ps_iattr; + mutex_unlock(&kernfs_mutex); /* - * Removal can be called multiple times on the same node. Only the - * first invocation is effective and puts the base ref. + * Activate the new node unless CREATE_DEACTIVATED is requested. + * If not activated here, the kernfs user is responsible for + * activating the node with kernfs_activate(). A node which hasn't + * been activated is not visible to userland and its removal won't + * trigger deactivation. */ - if (kn->flags & KERNFS_REMOVED) - return; - - if (kn->parent) { - kernfs_unlink_sibling(kn); - - /* Update timestamps on the parent */ - ps_iattr = kn->parent->iattr; - if (ps_iattr) { - ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; - ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; - } - } - - kn->flags |= KERNFS_REMOVED; - kn->u.removed_list = acxt->removed; - acxt->removed = kn; -} + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + return 0; -/** - * kernfs_addrm_finish - finish up kernfs_node add/remove - * @acxt: addrm context to finish up - * - * Finish up kernfs_node add/remove. Resources acquired by - * kernfs_addrm_start() are released and removed kernfs_nodes are - * cleaned up. - * - * LOCKING: - * kernfs_mutex is released. - */ -void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) - __releases(kernfs_mutex) -{ - /* release resources acquired by kernfs_addrm_start() */ +out_unlock: mutex_unlock(&kernfs_mutex); - - /* kill removed kernfs_nodes */ - while (acxt->removed) { - struct kernfs_node *kn = acxt->removed; - - acxt->removed = kn->u.removed_list; - - kernfs_deactivate(kn); - kernfs_unmap_bin_file(kn); - kernfs_put(kn); - } + return ret; } /** @@ -599,13 +696,15 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_create_root - create a new kernfs hierarchy - * @kdops: optional directory syscall operations for the hierarchy + * @scops: optional syscall operations for the hierarchy + * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Returns the root of the new hierarchy on success, ERR_PTR() value on * failure. */ -struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) +struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, + unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; @@ -624,12 +723,16 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) return ERR_PTR(-ENOMEM); } - kn->flags &= ~KERNFS_REMOVED; kn->priv = priv; kn->dir.root = root; - root->dir_ops = kdops; + root->syscall_ops = scops; + root->flags = flags; root->kn = kn; + init_waitqueue_head(&root->deactivate_waitq); + + if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); return root; } @@ -660,7 +763,6 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, void *priv, const void *ns) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; int rc; @@ -674,10 +776,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, kn->priv = priv; /* link in */ - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn); - kernfs_addrm_finish(&acxt); - + rc = kernfs_add_one(kn); if (!rc) return kn; @@ -703,7 +802,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* no such entry */ - if (!kn) { + if (!kn || !kernfs_active(kn)) { ret = NULL; goto out_unlock; } @@ -728,23 +827,37 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; - struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; + struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; + int ret; - if (!kdops || !kdops->mkdir) + if (!scops || !scops->mkdir) return -EPERM; - return kdops->mkdir(parent, dentry->d_name.name, mode); + if (!kernfs_get_active(parent)) + return -ENODEV; + + ret = scops->mkdir(parent, dentry->d_name.name, mode); + + kernfs_put_active(parent); + return ret; } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = dentry->d_fsdata; - struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; - if (!kdops || !kdops->rmdir) + if (!scops || !scops->rmdir) return -EPERM; - return kdops->rmdir(kn); + if (!kernfs_get_active(kn)) + return -ENODEV; + + ret = scops->rmdir(kn); + + kernfs_put_active(kn); + return ret; } static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -752,12 +865,25 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, { struct kernfs_node *kn = old_dentry->d_fsdata; struct kernfs_node *new_parent = new_dir->i_private; - struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; - if (!kdops || !kdops->rename) + if (!scops || !scops->rename) return -EPERM; - return kdops->rename(kn, new_parent, new_dentry->d_name.name); + if (!kernfs_get_active(kn)) + return -ENODEV; + + if (!kernfs_get_active(new_parent)) { + kernfs_put_active(kn); + return -ENODEV; + } + + ret = scops->rename(kn, new_parent, new_dentry->d_name.name); + + kernfs_put_active(new_parent); + kernfs_put_active(kn); + return ret; } const struct inode_operations kernfs_dir_iops = { @@ -830,23 +956,104 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } -static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, - struct kernfs_node *kn) +/** + * kernfs_activate - activate a node which started deactivated + * @kn: kernfs_node whose subtree is to be activated + * + * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node + * needs to be explicitly activated. A node which hasn't been activated + * isn't visible to userland and deactivation is skipped during its + * removal. This is useful to construct atomic init sequences where + * creation of multiple nodes should either succeed or fail atomically. + * + * The caller is responsible for ensuring that this function is not called + * after kernfs_remove*() is invoked on @kn. + */ +void kernfs_activate(struct kernfs_node *kn) { - struct kernfs_node *pos, *next; + struct kernfs_node *pos; - if (!kn) + mutex_lock(&kernfs_mutex); + + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + if (!pos || (pos->flags & KERNFS_ACTIVATED)) + continue; + + WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); + WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); + pos->flags |= KERNFS_ACTIVATED; + } + + mutex_unlock(&kernfs_mutex); +} + +static void __kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + lockdep_assert_held(&kernfs_mutex); + + /* + * Short-circuit if non-root @kn has already finished removal. + * This is for kernfs_remove_self() which plays with active ref + * after removal. + */ + if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb))) return; pr_debug("kernfs %s: removing\n", kn->name); - next = NULL; + /* prevent any new usage under @kn by deactivating all nodes */ + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) + if (kernfs_active(pos)) + atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + + /* deactivate and unlink the subtree node-by-node */ do { - pos = next; - next = kernfs_next_descendant_post(pos, kn); - if (pos) - kernfs_remove_one(acxt, pos); - } while (next); + pos = kernfs_leftmost_descendant(kn); + + /* + * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * base ref could have been put by someone else by the time + * the function returns. Make sure it doesn't go away + * underneath us. + */ + kernfs_get(pos); + + /* + * Drain iff @kn was activated. This avoids draining and + * its lockdep annotations for nodes which have never been + * activated and allows embedding kernfs_remove() in create + * error paths without worrying about draining. + */ + if (kn->flags & KERNFS_ACTIVATED) + kernfs_drain(pos); + else + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + /* + * kernfs_unlink_sibling() succeeds once per node. Use it + * to decide who's responsible for cleanups. + */ + if (!pos->parent || kernfs_unlink_sibling(pos)) { + struct kernfs_iattrs *ps_iattr = + pos->parent ? pos->parent->iattr : NULL; + + /* update timestamps on the parent */ + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + + kernfs_put(pos); + } + + kernfs_put(pos); + } while (pos != kn); } /** @@ -857,11 +1064,140 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, */ void kernfs_remove(struct kernfs_node *kn) { - struct kernfs_addrm_cxt acxt; + mutex_lock(&kernfs_mutex); + __kernfs_remove(kn); + mutex_unlock(&kernfs_mutex); +} + +/** + * kernfs_break_active_protection - break out of active protection + * @kn: the self kernfs_node + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. Each invocation of + * this function must also be matched with an invocation of + * kernfs_unbreak_active_protection(). + * + * This function releases the active reference of @kn the caller is + * holding. Once this function is called, @kn may be removed at any point + * and the caller is solely responsible for ensuring that the objects it + * dereferences are accessible. + */ +void kernfs_break_active_protection(struct kernfs_node *kn) +{ + /* + * Take out ourself out of the active ref dependency chain. If + * we're called without an active ref, lockdep will complain. + */ + kernfs_put_active(kn); +} + +/** + * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() + * @kn: the self kernfs_node + * + * If kernfs_break_active_protection() was called, this function must be + * invoked before finishing the kernfs operation. Note that while this + * function restores the active reference, it doesn't and can't actually + * restore the active protection - @kn may already or be in the process of + * being removed. Once kernfs_break_active_protection() is invoked, that + * protection is irreversibly gone for the kernfs operation instance. + * + * While this function may be called at any point after + * kernfs_break_active_protection() is invoked, its most useful location + * would be right before the enclosing kernfs operation returns. + */ +void kernfs_unbreak_active_protection(struct kernfs_node *kn) +{ + /* + * @kn->active could be in any state; however, the increment we do + * here will be undone as soon as the enclosing kernfs operation + * finishes and this temporary bump can't break anything. If @kn + * is alive, nothing changes. If @kn is being deactivated, the + * soon-to-follow put will either finish deactivation or restore + * deactivated state. If @kn is already removed, the temporary + * bump is guaranteed to be gone before @kn is released. + */ + atomic_inc(&kn->active); + if (kernfs_lockdep(kn)) + rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); +} + +/** + * kernfs_remove_self - remove a kernfs_node from its own method + * @kn: the self kernfs_node to remove + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. This can be used to + * implement a file operation which deletes itself. + * + * For example, the "delete" file for a sysfs device directory can be + * implemented by invoking kernfs_remove_self() on the "delete" file + * itself. This function breaks the circular dependency of trying to + * deactivate self while holding an active ref itself. It isn't necessary + * to modify the usual removal path to use kernfs_remove_self(). The + * "delete" implementation can simply invoke kernfs_remove_self() on self + * before proceeding with the usual removal path. kernfs will ignore later + * kernfs_remove() on self. + * + * kernfs_remove_self() can be called multiple times concurrently on the + * same kernfs_node. Only the first one actually performs removal and + * returns %true. All others will wait until the kernfs operation which + * won self-removal finishes and return %false. Note that the losers wait + * for the completion of not only the winning kernfs_remove_self() but also + * the whole kernfs_ops which won the arbitration. This can be used to + * guarantee, for example, all concurrent writes to a "delete" file to + * finish only after the whole operation is complete. + */ +bool kernfs_remove_self(struct kernfs_node *kn) +{ + bool ret; + + mutex_lock(&kernfs_mutex); + kernfs_break_active_protection(kn); + + /* + * SUICIDAL is used to arbitrate among competing invocations. Only + * the first one will actually perform removal. When the removal + * is complete, SUICIDED is set and the active ref is restored + * while holding kernfs_mutex. The ones which lost arbitration + * waits for SUICDED && drained which can happen only after the + * enclosing kernfs operation which executed the winning instance + * of kernfs_remove_self() finished. + */ + if (!(kn->flags & KERNFS_SUICIDAL)) { + kn->flags |= KERNFS_SUICIDAL; + __kernfs_remove(kn); + kn->flags |= KERNFS_SUICIDED; + ret = true; + } else { + wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; + DEFINE_WAIT(wait); - kernfs_addrm_start(&acxt); - __kernfs_remove(&acxt, kn); - kernfs_addrm_finish(&acxt); + while (true) { + prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); + + if ((kn->flags & KERNFS_SUICIDED) && + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) + break; + + mutex_unlock(&kernfs_mutex); + schedule(); + mutex_lock(&kernfs_mutex); + } + finish_wait(waitq, &wait); + WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); + ret = false; + } + + /* + * This must be done while holding kernfs_mutex; otherwise, waiting + * for SUICIDED && deactivated could finish prematurely. + */ + kernfs_unbreak_active_protection(kn); + + mutex_unlock(&kernfs_mutex); + return ret; } /** @@ -876,7 +1212,6 @@ void kernfs_remove(struct kernfs_node *kn) int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; if (!parent) { @@ -885,13 +1220,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - kernfs_addrm_start(&acxt); + mutex_lock(&kernfs_mutex); kn = kernfs_find_ns(parent, name, ns); if (kn) - __kernfs_remove(&acxt, kn); + __kernfs_remove(kn); - kernfs_addrm_finish(&acxt); + mutex_unlock(&kernfs_mutex); if (kn) return 0; @@ -909,12 +1244,18 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { + struct kernfs_node *old_parent; + const char *old_name = NULL; int error; + /* can't move or rename root */ + if (!kn->parent) + return -EINVAL; + mutex_lock(&kernfs_mutex); error = -ENOENT; - if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) + if (!kernfs_active(kn) || !kernfs_active(new_parent)) goto out; error = 0; @@ -932,13 +1273,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, new_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out; - - if (kn->flags & KERNFS_STATIC_NAME) - kn->flags &= ~KERNFS_STATIC_NAME; - else - kfree(kn->name); - - kn->name = new_name; + } else { + new_name = NULL; } /* @@ -946,12 +1282,29 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, */ kernfs_unlink_sibling(kn); kernfs_get(new_parent); - kernfs_put(kn->parent); + + /* rename_lock protects ->parent and ->name accessors */ + spin_lock_irq(&kernfs_rename_lock); + + old_parent = kn->parent; + kn->parent = new_parent; + kn->ns = new_ns; + if (new_name) { + if (!(kn->flags & KERNFS_STATIC_NAME)) + old_name = kn->name; + kn->flags &= ~KERNFS_STATIC_NAME; + kn->name = new_name; + } + + spin_unlock_irq(&kernfs_rename_lock); + kn->hash = kernfs_name_hash(kn->name, kn->ns); - kn->parent = new_parent; kernfs_link_sibling(kn); + kernfs_put(old_parent); + kfree(old_name); + error = 0; out: mutex_unlock(&kernfs_mutex); @@ -974,7 +1327,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { - int valid = !(pos->flags & KERNFS_REMOVED) && + int valid = kernfs_active(pos) && pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) @@ -993,8 +1346,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, break; } } - /* Skip over entries in the wrong namespace */ - while (pos && pos->ns != ns) { + /* Skip over entries which are dying/dead or in the wrong namespace */ + while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; @@ -1008,14 +1361,15 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); - if (pos) + if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); - } while (pos && pos->ns != ns); + } while (pos && (!kernfs_active(pos) || pos->ns != ns)); + } return pos; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index dbf397bfdff2..5e9a80cfc3d8 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -252,10 +252,18 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct kernfs_open_file *of = kernfs_of(file); - ssize_t len = min_t(size_t, count, PAGE_SIZE); const struct kernfs_ops *ops; + size_t len; char *buf; + if (of->atomic_write_len) { + len = count; + if (len > of->atomic_write_len) + return -E2BIG; + } else { + len = min_t(size_t, count, PAGE_SIZE); + } + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -476,6 +484,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); + if (rc) + goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() @@ -600,6 +610,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; @@ -614,14 +625,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) has_write = ops->write || ops->mmap; has_mmap = ops->mmap; - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; @@ -653,6 +666,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) of->file = file; /* + * Write path needs to atomic_write_len outside active reference. + * Cache it in open_file. See kernfs_fop_write() for details. + */ + of->atomic_write_len = ops->atomic_write_len; + + /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. @@ -820,7 +839,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, bool name_is_static, struct lock_class_key *key) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; unsigned flags; int rc; @@ -855,10 +873,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn); - kernfs_addrm_finish(&acxt); - + rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index e55126f85bd2..985217626e66 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -48,14 +48,18 @@ void __init kernfs_inode_init(void) static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { + static DEFINE_MUTEX(iattr_mutex); + struct kernfs_iattrs *ret; struct iattr *iattrs; + mutex_lock(&iattr_mutex); + if (kn->iattr) - return kn->iattr; + goto out_unlock; kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); if (!kn->iattr) - return NULL; + goto out_unlock; iattrs = &kn->iattr->ia_iattr; /* assign default attributes */ @@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; simple_xattrs_init(&kn->iattr->xattrs); - - return kn->iattr; +out_unlock: + ret = kn->iattr; + mutex_unlock(&iattr_mutex); + return ret; } static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) @@ -355,7 +361,7 @@ void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); kernfs_put(kn); } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index eb536b76374a..8be13b2a079b 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,7 +26,8 @@ struct kernfs_iattrs { struct simple_xattrs xattrs; }; -#define KN_DEACTIVATED_BIAS INT_MIN +/* +1 to avoid triggering overflow warning when negating it */ +#define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ @@ -45,13 +46,6 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) } /* - * Context structure to be used while adding/removing nodes. - */ -struct kernfs_addrm_cxt { - struct kernfs_node *removed; -}; - -/* * mount.c */ struct kernfs_super_info { @@ -71,6 +65,7 @@ struct kernfs_super_info { }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) +extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache; /* @@ -100,9 +95,7 @@ extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); -void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn); -void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, unsigned flags); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 0f4152defe7b..95dcd1d558bb 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -19,13 +19,50 @@ struct kmem_cache *kernfs_node_cache; -static const struct super_operations kernfs_sops = { +static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct kernfs_root *root = kernfs_info(sb)->root; + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->remount_fs) + return scops->remount_fs(root, flags, data); + return 0; +} + +static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_options) + return scops->show_options(sf, root); + return 0; +} + +const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, + + .remount_fs = kernfs_sop_remount_fs, + .show_options = kernfs_sop_show_options, }; -static int kernfs_fill_super(struct super_block *sb) +/** + * kernfs_root_from_sb - determine kernfs_root associated with a super_block + * @sb: the super_block in question + * + * Return the kernfs_root associated with @sb. If @sb is not a kernfs one, + * %NULL is returned. + */ +struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) +{ + if (sb->s_op == &kernfs_sops) + return kernfs_info(sb)->root; + return NULL; +} + +static int kernfs_fill_super(struct super_block *sb, unsigned long magic) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -33,7 +70,7 @@ static int kernfs_fill_super(struct super_block *sb) sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; + sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_time_gran = 1; @@ -94,6 +131,7 @@ const void *kernfs_super_ns(struct super_block *sb) * @fs_type: file_system_type of the fs being mounted * @flags: mount flags specified for the mount * @root: kernfs_root of the hierarchy being mounted + * @magic: file system specific magic number * @new_sb_created: tell the caller if we allocated a new superblock * @ns: optional namespace tag of the mount * @@ -105,8 +143,8 @@ const void *kernfs_super_ns(struct super_block *sb) * The return value can be passed to the vfs layer verbatim. */ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, bool *new_sb_created, - const void *ns) + struct kernfs_root *root, unsigned long magic, + bool *new_sb_created, const void *ns) { struct super_block *sb; struct kernfs_super_info *info; @@ -129,7 +167,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, *new_sb_created = !sb->s_root; if (!sb->s_root) { - error = kernfs_fill_super(sb); + error = kernfs_fill_super(sb, magic); if (error) { deactivate_locked_super(sb); return ERR_PTR(error); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 4d457055acb9..8a198898e39a 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -27,7 +27,6 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, struct kernfs_node *target) { struct kernfs_node *kn; - struct kernfs_addrm_cxt acxt; int error; kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); @@ -39,10 +38,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ - kernfs_addrm_start(&acxt); - error = kernfs_add_one(&acxt, kn); - kernfs_addrm_finish(&acxt); - + error = kernfs_add_one(kn); if (!error) return kn; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 10d6c41aecad..6bf06a07f3e0 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -235,6 +235,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); + svc_shutdown_net(serv, net); return err; } diff --git a/fs/locks.c b/fs/locks.c index 92a0f0a52b06..e390bd9ae068 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -135,6 +135,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) +#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) { @@ -344,48 +345,43 @@ static int assign_type(struct file_lock *fl, long type) return 0; } -/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX - * style lock. - */ -static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, - struct flock *l) +static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock64 *l) { - off_t start, end; - switch (l->l_whence) { case SEEK_SET: - start = 0; + fl->fl_start = 0; break; case SEEK_CUR: - start = filp->f_pos; + fl->fl_start = filp->f_pos; break; case SEEK_END: - start = i_size_read(file_inode(filp)); + fl->fl_start = i_size_read(file_inode(filp)); break; default: return -EINVAL; } + if (l->l_start > OFFSET_MAX - fl->fl_start) + return -EOVERFLOW; + fl->fl_start += l->l_start; + if (fl->fl_start < 0) + return -EINVAL; /* POSIX-1996 leaves the case l->l_len < 0 undefined; POSIX-2001 defines it. */ - start += l->l_start; - if (start < 0) - return -EINVAL; - fl->fl_end = OFFSET_MAX; if (l->l_len > 0) { - end = start + l->l_len - 1; - fl->fl_end = end; + if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) + return -EOVERFLOW; + fl->fl_end = fl->fl_start + l->l_len - 1; + } else if (l->l_len < 0) { - end = start - 1; - fl->fl_end = end; - start += l->l_len; - if (start < 0) + if (fl->fl_start + l->l_len < 0) return -EINVAL; - } - fl->fl_start = start; /* we record the absolute position */ - if (fl->fl_end < fl->fl_start) - return -EOVERFLOW; - + fl->fl_end = fl->fl_start - 1; + fl->fl_start += l->l_len; + } else + fl->fl_end = OFFSET_MAX; + fl->fl_owner = current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -396,52 +392,21 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, return assign_type(fl, l->l_type); } -#if BITS_PER_LONG == 32 -static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, - struct flock64 *l) +/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX + * style lock. + */ +static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, + struct flock *l) { - loff_t start; - - switch (l->l_whence) { - case SEEK_SET: - start = 0; - break; - case SEEK_CUR: - start = filp->f_pos; - break; - case SEEK_END: - start = i_size_read(file_inode(filp)); - break; - default: - return -EINVAL; - } + struct flock64 ll = { + .l_type = l->l_type, + .l_whence = l->l_whence, + .l_start = l->l_start, + .l_len = l->l_len, + }; - start += l->l_start; - if (start < 0) - return -EINVAL; - fl->fl_end = OFFSET_MAX; - if (l->l_len > 0) { - fl->fl_end = start + l->l_len - 1; - } else if (l->l_len < 0) { - fl->fl_end = start - 1; - start += l->l_len; - if (start < 0) - return -EINVAL; - } - fl->fl_start = start; /* we record the absolute position */ - if (fl->fl_end < fl->fl_start) - return -EOVERFLOW; - - fl->fl_owner = current->files; - fl->fl_pid = current->tgid; - fl->fl_file = filp; - fl->fl_flags = FL_POSIX; - fl->fl_ops = NULL; - fl->fl_lmops = NULL; - - return assign_type(fl, l->l_type); + return flock64_to_posix_lock(filp, fl, &ll); } -#endif /* default lease lock manager operations */ static void lease_break_callback(struct file_lock *fl) @@ -511,8 +476,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) } /* Must be called with the i_lock held! */ -static inline void -locks_insert_global_locks(struct file_lock *fl) +static void locks_insert_global_locks(struct file_lock *fl) { lg_local_lock(&file_lock_lglock); fl->fl_link_cpu = smp_processor_id(); @@ -521,8 +485,7 @@ locks_insert_global_locks(struct file_lock *fl) } /* Must be called with the i_lock held! */ -static inline void -locks_delete_global_locks(struct file_lock *fl) +static void locks_delete_global_locks(struct file_lock *fl) { /* * Avoid taking lock if already unhashed. This is safe since this check @@ -544,14 +507,12 @@ posix_owner_key(struct file_lock *fl) return (unsigned long)fl->fl_owner; } -static inline void -locks_insert_global_blocked(struct file_lock *waiter) +static void locks_insert_global_blocked(struct file_lock *waiter) { hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); } -static inline void -locks_delete_global_blocked(struct file_lock *waiter) +static void locks_delete_global_blocked(struct file_lock *waiter) { hash_del(&waiter->fl_link); } @@ -581,7 +542,7 @@ static void locks_delete_block(struct file_lock *waiter) * it seems like the reasonable thing to do. * * Must be called with both the i_lock and blocked_lock_lock held. The fl_block - * list itself is protected by the file_lock_list, but by ensuring that the + * list itself is protected by the blocked_lock_lock, but by ensuring that the * i_lock is also held on insertions we can avoid taking the blocked_lock_lock * in some cases when we see that the fl_block list is empty. */ @@ -591,7 +552,7 @@ static void __locks_insert_block(struct file_lock *blocker, BUG_ON(!list_empty(&waiter->fl_block)); waiter->fl_next = blocker; list_add_tail(&waiter->fl_block, &blocker->fl_block); - if (IS_POSIX(blocker)) + if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) locks_insert_global_blocked(waiter); } @@ -652,15 +613,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) locks_insert_global_locks(fl); } -/* - * Delete a lock and then free it. - * Wake up processes that are blocked waiting for this lock, - * notify the FS that the lock has been cleared and - * finally free the lock. +/** + * locks_delete_lock - Delete a lock and then free it. + * @thisfl_p: pointer that points to the fl_next field of the previous + * inode->i_flock list entry + * + * Unlink a lock from all lists and free the namespace reference, but don't + * free it yet. Wake up processes that are blocked waiting for this lock and + * notify the FS that the lock has been cleared. * * Must be called with the i_lock held! */ -static void locks_delete_lock(struct file_lock **thisfl_p) +static void locks_unlink_lock(struct file_lock **thisfl_p) { struct file_lock *fl = *thisfl_p; @@ -675,6 +639,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p) } locks_wake_up_blocks(fl); +} + +/* + * Unlink a lock from all lists and free it. + * + * Must be called with i_lock held! + */ +static void locks_delete_lock(struct file_lock **thisfl_p) +{ + struct file_lock *fl = *thisfl_p; + + locks_unlink_lock(thisfl_p); locks_free_lock(fl); } @@ -769,8 +745,16 @@ EXPORT_SYMBOL(posix_test_lock); * Note: the above assumption may not be true when handling lock * requests from a broken NFS client. It may also fail in the presence * of tasks (such as posix threads) sharing the same open file table. - * * To handle those cases, we just bail out after a few iterations. + * + * For FL_OFDLCK locks, the owner is the filp, not the files_struct. + * Because the owner is not even nominally tied to a thread of + * execution, the deadlock detection below can't reasonably work well. Just + * skip it for those. + * + * In principle, we could do a more limited deadlock detection on FL_OFDLCK + * locks that just checks for the case where two tasks are attempting to + * upgrade from read to write locks on the same inode. */ #define MAX_DEADLK_ITERATIONS 10 @@ -793,6 +777,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, { int i = 0; + /* + * This deadlock detector can't reasonably detect deadlocks with + * FL_OFDLCK locks, since they aren't owned by a process, per-se. + */ + if (IS_OFDLCK(caller_fl)) + return 0; + while ((block_fl = what_owner_is_waiting_for(block_fl))) { if (i++ > MAX_DEADLK_ITERATIONS) return 0; @@ -1152,13 +1143,14 @@ EXPORT_SYMBOL(posix_lock_file_wait); /** * locks_mandatory_locked - Check for an active lock - * @inode: the file to check + * @file: the file to check * * Searches the inode's list of locks to find any POSIX locks which conflict. * This function is called from locks_verify_locked() only. */ -int locks_mandatory_locked(struct inode *inode) +int locks_mandatory_locked(struct file *file) { + struct inode *inode = file_inode(file); fl_owner_t owner = current->files; struct file_lock *fl; @@ -1169,7 +1161,7 @@ int locks_mandatory_locked(struct inode *inode) for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; - if (fl->fl_owner != owner) + if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file) break; } spin_unlock(&inode->i_lock); @@ -1195,19 +1187,30 @@ int locks_mandatory_area(int read_write, struct inode *inode, { struct file_lock fl; int error; + bool sleep = false; locks_init_lock(&fl); - fl.fl_owner = current->files; fl.fl_pid = current->tgid; fl.fl_file = filp; fl.fl_flags = FL_POSIX | FL_ACCESS; if (filp && !(filp->f_flags & O_NONBLOCK)) - fl.fl_flags |= FL_SLEEP; + sleep = true; fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; fl.fl_start = offset; fl.fl_end = offset + count - 1; for (;;) { + if (filp) { + fl.fl_owner = (fl_owner_t)filp; + fl.fl_flags &= ~FL_SLEEP; + error = __posix_lock_file(inode, &fl, NULL); + if (!error) + break; + } + + if (sleep) + fl.fl_flags |= FL_SLEEP; + fl.fl_owner = current->files; error = __posix_lock_file(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; @@ -1376,11 +1379,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) restart: break_time = flock->fl_break_time; - if (break_time != 0) { + if (break_time != 0) break_time -= jiffies; - if (break_time == 0) - break_time++; - } + if (break_time == 0) + break_time++; locks_insert_block(flock, new_fl); spin_unlock(&inode->i_lock); error = wait_event_interruptible_timeout(new_fl->fl_wait, @@ -1472,6 +1474,32 @@ int fcntl_getlease(struct file *filp) return type; } +/** + * check_conflicting_open - see if the given dentry points to a file that has + * an existing open that would conflict with the + * desired lease. + * @dentry: dentry to check + * @arg: type of lease that we're trying to acquire + * + * Check to see if there's an existing open fd on this file that would + * conflict with the lease we're trying to set. + */ +static int +check_conflicting_open(const struct dentry *dentry, const long arg) +{ + int ret = 0; + struct inode *inode = dentry->d_inode; + + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + return -EAGAIN; + + if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || + (atomic_read(&inode->i_count) > 1))) + ret = -EAGAIN; + + return ret; +} + static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) { struct file_lock *fl, **before, **my_before = NULL, *lease; @@ -1499,12 +1527,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp return -EINVAL; } - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((d_count(dentry) > 1) - || (atomic_read(&inode->i_count) > 1))) + error = check_conflicting_open(dentry, arg); + if (error) goto out; /* @@ -1549,7 +1573,19 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp goto out; locks_insert_lock(before, lease); - error = 0; + /* + * The check in break_lease() is lockless. It's possible for another + * open to race in after we did the earlier check for a conflicting + * open but before the lease was inserted. Check again for a + * conflicting open and cancel the lease if there is one. + * + * We also add a barrier here to ensure that the insertion of the lock + * precedes these checks. + */ + smp_mb(); + error = check_conflicting_open(dentry, arg); + if (error) + locks_unlink_lock(flp); out: if (is_deleg) mutex_unlock(&inode->i_mutex); @@ -1842,7 +1878,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -1864,7 +1900,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = fl->fl_pid; + flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -1876,7 +1912,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk(struct file *filp, struct flock __user *l) +int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) { struct file_lock file_lock; struct flock flock; @@ -1893,6 +1929,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l) if (error) goto out; + if (cmd == F_OFD_GETLK) { + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_GETLK; + file_lock.fl_flags |= FL_OFDLCK; + file_lock.fl_owner = (fl_owner_t)filp; + } + error = vfs_test_lock(filp, &file_lock); if (error) goto out; @@ -1976,6 +2022,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } +/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +static int +check_fmode_for_setlk(struct file_lock *fl) +{ + switch (fl->fl_type) { + case F_RDLCK: + if (!(fl->fl_file->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(fl->fl_file->f_mode & FMODE_WRITE)) + return -EBADF; + } + return 0; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -2012,25 +2074,36 @@ again: error = flock_to_posix_lock(filp, file_lock, &flock); if (error) goto out; - if (cmd == F_SETLKW) { - file_lock->fl_flags |= FL_SLEEP; - } - - error = -EBADF; - switch (flock.l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - goto out; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) + + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + + /* + * If the cmd is requesting file-private locks, then set the + * FL_OFDLCK flag and override the owner. + */ + switch (cmd) { + case F_OFD_SETLK: + error = -EINVAL; + if (flock.l_pid != 0) goto out; + + cmd = F_SETLK; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; break; - case F_UNLCK: - break; - default: + case F_OFD_SETLKW: error = -EINVAL; - goto out; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLKW; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; + /* Fallthrough */ + case F_SETLKW: + file_lock->fl_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2061,7 +2134,7 @@ out: /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk64(struct file *filp, struct flock64 __user *l) +int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) { struct file_lock file_lock; struct flock64 flock; @@ -2078,6 +2151,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l) if (error) goto out; + if (cmd == F_OFD_GETLK) { + error = -EINVAL; + if (flock.l_pid != 0) + goto out; + + cmd = F_GETLK64; + file_lock.fl_flags |= FL_OFDLCK; + file_lock.fl_owner = (fl_owner_t)filp; + } + error = vfs_test_lock(filp, &file_lock); if (error) goto out; @@ -2130,25 +2213,36 @@ again: error = flock64_to_posix_lock(filp, file_lock, &flock); if (error) goto out; - if (cmd == F_SETLKW64) { - file_lock->fl_flags |= FL_SLEEP; - } - - error = -EBADF; - switch (flock.l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - goto out; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) + + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + + /* + * If the cmd is requesting file-private locks, then set the + * FL_OFDLCK flag and override the owner. + */ + switch (cmd) { + case F_OFD_SETLK: + error = -EINVAL; + if (flock.l_pid != 0) goto out; + + cmd = F_SETLK64; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; break; - case F_UNLCK: - break; - default: + case F_OFD_SETLKW: error = -EINVAL; - goto out; + if (flock.l_pid != 0) + goto out; + + cmd = F_SETLKW64; + file_lock->fl_flags |= FL_OFDLCK; + file_lock->fl_owner = (fl_owner_t)filp; + /* Fallthrough */ + case F_SETLKW64: + file_lock->fl_flags |= FL_SLEEP; } error = do_lock_file_wait(filp, cmd, file_lock); @@ -2209,7 +2303,7 @@ EXPORT_SYMBOL(locks_remove_posix); /* * This function is called on the last close of an open file. */ -void locks_remove_flock(struct file *filp) +void locks_remove_file(struct file *filp) { struct inode * inode = file_inode(filp); struct file_lock *fl; @@ -2218,6 +2312,8 @@ void locks_remove_flock(struct file *filp) if (!inode->i_flock) return; + locks_remove_posix(filp, (fl_owner_t)filp); + if (filp->f_op->flock) { struct file_lock fl = { .fl_pid = current->tgid, @@ -2236,16 +2332,28 @@ void locks_remove_flock(struct file *filp) while ((fl = *before) != NULL) { if (fl->fl_file == filp) { - if (IS_FLOCK(fl)) { - locks_delete_lock(before); - continue; - } if (IS_LEASE(fl)) { lease_modify(before, F_UNLCK); continue; } - /* What? */ - BUG(); + + /* + * There's a leftover lock on the list of a type that + * we didn't expect to see. Most likely a classic + * POSIX lock that ended up not getting released + * properly, or that raced onto the list somehow. Log + * some info about it and then just remove it from + * the list. + */ + WARN(!IS_FLOCK(fl), + "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino, + fl->fl_type, fl->fl_flags, + fl->fl_start, fl->fl_end); + + locks_delete_lock(before); + continue; } before = &fl->fl_next; } @@ -2314,8 +2422,14 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { - seq_printf(f, "%6s %s ", - (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", + if (fl->fl_flags & FL_ACCESS) + seq_printf(f, "ACCESS"); + else if (IS_OFDLCK(fl)) + seq_printf(f, "OFDLCK"); + else + seq_printf(f, "POSIX "); + + seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); } else if (IS_FLOCK(fl)) { @@ -2385,6 +2499,7 @@ static int locks_show(struct seq_file *f, void *v) } static void *locks_start(struct seq_file *f, loff_t *pos) + __acquires(&blocked_lock_lock) { struct locks_iterator *iter = f->private; @@ -2403,6 +2518,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) } static void locks_stop(struct seq_file *f, void *v) + __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); lg_global_unlock(&file_lock_lglock); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 9a59cbade2fb..48140315f627 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode) do_delete_inode(inode); } } - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Cheaper version of write_inode. All changes are concealed in diff --git a/fs/mbcache.c b/fs/mbcache.c index e519e45bf673..bf166e388f0d 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -26,6 +26,41 @@ * back on the lru list. */ +/* + * Lock descriptions and usage: + * + * Each hash chain of both the block and index hash tables now contains + * a built-in lock used to serialize accesses to the hash chain. + * + * Accesses to global data structures mb_cache_list and mb_cache_lru_list + * are serialized via the global spinlock mb_cache_spinlock. + * + * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize + * accesses to its local data, such as e_used and e_queued. + * + * Lock ordering: + * + * Each block hash chain's lock has the highest lock order, followed by an + * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's + * lock), and mb_cach_spinlock, with the lowest order. While holding + * either a block or index hash chain lock, a thread can acquire an + * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. + * + * Synchronization: + * + * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and + * index hash chian, it needs to lock the corresponding hash chain. For each + * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to + * prevent either any simultaneous release or free on the entry and also + * to serialize accesses to either the e_used or e_queued member of the entry. + * + * To avoid having a dangling reference to an already freed + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a + * block hash chain and also no longer being referenced, both e_used, + * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is + * first removed from a block hash chain. + */ + #include <linux/kernel.h> #include <linux/module.h> @@ -34,9 +69,10 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched.h> -#include <linux/init.h> +#include <linux/list_bl.h> #include <linux/mbcache.h> - +#include <linux/init.h> +#include <linux/blockgroup_lock.h> #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ @@ -57,8 +93,14 @@ #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) +#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS) +#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ + (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) + static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); - +static struct blockgroup_lock *mb_cache_bg_lock; +static struct kmem_cache *mb_cache_kmem_cache; + MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); MODULE_LICENSE("GPL"); @@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list); static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); +static inline void +__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_lock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + +static inline void +__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) +{ + spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, + MB_CACHE_ENTRY_LOCK_INDEX(ce))); +} + static inline int -__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) +__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) { - return !list_empty(&ce->e_block_list); + return !hlist_bl_unhashed(&ce->e_block_list); } -static void -__mb_cache_entry_unhash(struct mb_cache_entry *ce) +static inline void +__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) { - if (__mb_cache_entry_is_hashed(ce)) { - list_del_init(&ce->e_block_list); - list_del(&ce->e_index.o_list); - } + if (__mb_cache_entry_is_block_hashed(ce)) + hlist_bl_del_init(&ce->e_block_list); } +static inline int +__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +{ + return !hlist_bl_unhashed(&ce->e_index.o_list); +} + +static inline void +__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_index_hashed(ce)) + hlist_bl_del_init(&ce->e_index.o_list); +} + +/* + * __mb_cache_entry_unhash_unlock() + * + * This function is called to unhash both the block and index hash + * chain. + * It assumes both the block and index hash chain is locked upon entry. + * It also unlock both hash chains both exit + */ +static inline void +__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) +{ + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); +} static void __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) { struct mb_cache *cache = ce->e_cache; - mb_assert(!(ce->e_used || ce->e_queued)); + mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); kmem_cache_free(cache->c_entry_cache, ce); atomic_dec(&cache->c_entry_count); } - static void -__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) - __releases(mb_cache_spinlock) +__mb_cache_entry_release(struct mb_cache_entry *ce) { + /* First lock the entry to serialize access to its local data. */ + __spin_lock_mb_cache_entry(ce); /* Wake up all processes queuing for this cache entry. */ if (ce->e_queued) wake_up_all(&mb_cache_queue); if (ce->e_used >= MB_CACHE_WRITER) ce->e_used -= MB_CACHE_WRITER; + /* + * Make sure that all cache entries on lru_list have + * both e_used and e_qued of 0s. + */ ce->e_used--; - if (!(ce->e_used || ce->e_queued)) { - if (!__mb_cache_entry_is_hashed(ce)) + if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { + if (!__mb_cache_entry_is_block_hashed(ce)) { + __spin_unlock_mb_cache_entry(ce); goto forget; - mb_assert(list_empty(&ce->e_lru_list)); - list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + } + /* + * Need access to lru list, first drop entry lock, + * then reacquire the lock in the proper order. + */ + spin_lock(&mb_cache_spinlock); + if (list_empty(&ce->e_lru_list)) + list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); + spin_unlock(&mb_cache_spinlock); } - spin_unlock(&mb_cache_spinlock); + __spin_unlock_mb_cache_entry(ce); return; forget: - spin_unlock(&mb_cache_spinlock); + mb_assert(list_empty(&ce->e_lru_list)); __mb_cache_entry_forget(ce, GFP_KERNEL); } - /* * mb_cache_shrink_scan() memory pressure callback * @@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); - while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { + while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { struct mb_cache_entry *ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - freed++; + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* Prevent any find or get operation on the entry */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + list_add_tail(&ce->e_lru_list, &free_list); + spin_lock(&mb_cache_spinlock); } spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { __mb_cache_entry_forget(entry, gfp_mask); + freed++; } return freed; } @@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits) int n, bucket_count = 1 << bucket_bits; struct mb_cache *cache = NULL; + if (!mb_cache_bg_lock) { + mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), + GFP_KERNEL); + if (!mb_cache_bg_lock) + return NULL; + bgl_lock_init(mb_cache_bg_lock); + } + cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) return NULL; cache->c_name = name; atomic_set(&cache->c_entry_count, 0); cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + cache->c_block_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_block_hash) goto fail; for (n=0; n<bucket_count; n++) - INIT_LIST_HEAD(&cache->c_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); + cache->c_index_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_index_hash) goto fail; for (n=0; n<bucket_count; n++) - INIT_LIST_HEAD(&cache->c_index_hash[n]); - cache->c_entry_cache = kmem_cache_create(name, - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - if (!cache->c_entry_cache) - goto fail2; + INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); + if (!mb_cache_kmem_cache) { + mb_cache_kmem_cache = kmem_cache_create(name, + sizeof(struct mb_cache_entry), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!mb_cache_kmem_cache) + goto fail2; + } + cache->c_entry_cache = mb_cache_kmem_cache; /* * Set an upper limit on the number of cache entries so that the hash @@ -273,21 +395,47 @@ void mb_cache_shrink(struct block_device *bdev) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct list_head *l; + struct mb_cache_entry *ce, *tmp; + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); if (ce->e_bdev == bdev) { - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + __mb_cache_entry_unhash_unlock(ce); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + list_add_tail(&ce->e_lru_list, &free_list); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); } } spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(ce, GFP_KERNEL); } } @@ -303,23 +451,27 @@ void mb_cache_destroy(struct mb_cache *cache) { LIST_HEAD(free_list); - struct list_head *l, *ltmp; + struct mb_cache_entry *ce, *tmp; spin_lock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &mb_cache_lru_list) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_lru_list); - if (ce->e_cache == cache) { + list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { + if (ce->e_cache == cache) list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); - } } list_del(&cache->c_cache_list); spin_unlock(&mb_cache_spinlock); - list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { + list_del_init(&ce->e_lru_list); + /* + * Prevent any find or get operation on the entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + __mb_cache_entry_forget(ce, GFP_KERNEL); } if (atomic_read(&cache->c_entry_count) > 0) { @@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache) atomic_read(&cache->c_entry_count)); } - kmem_cache_destroy(cache->c_entry_cache); - + if (list_empty(&mb_cache_list)) { + kmem_cache_destroy(mb_cache_kmem_cache); + mb_cache_kmem_cache = NULL; + } kfree(cache->c_index_hash); kfree(cache->c_block_hash); kfree(cache); @@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache) struct mb_cache_entry * mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) { - struct mb_cache_entry *ce = NULL; + struct mb_cache_entry *ce; if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + struct list_head *l; + + l = &mb_cache_lru_list; spin_lock(&mb_cache_spinlock); - if (!list_empty(&mb_cache_lru_list)) { - ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - __mb_cache_entry_unhash(ce); + while (!list_is_last(l, &mb_cache_lru_list)) { + l = l->next; + ce = list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_del_init(&ce->e_lru_list); + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt)) + continue; + spin_unlock(&mb_cache_spinlock); + /* + * Prevent any find or get operation on the + * entry. + */ + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + /* Ignore if it is touched by a find/get */ + if (ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt) || + !list_empty(&ce->e_lru_list)) { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + l = &mb_cache_lru_list; + spin_lock(&mb_cache_spinlock); + continue; + } + mb_assert(list_empty(&ce->e_lru_list)); + mb_assert(!(ce->e_used || ce->e_queued || + atomic_read(&ce->e_refcnt))); + __mb_cache_entry_unhash_unlock(ce); + goto found; + } } spin_unlock(&mb_cache_spinlock); } - if (!ce) { - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (!ce) - return NULL; - atomic_inc(&cache->c_entry_count); - INIT_LIST_HEAD(&ce->e_lru_list); - INIT_LIST_HEAD(&ce->e_block_list); - ce->e_cache = cache; - ce->e_queued = 0; - } + + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; + atomic_inc(&cache->c_entry_count); + INIT_LIST_HEAD(&ce->e_lru_list); + INIT_HLIST_BL_NODE(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_index.o_list); + ce->e_cache = cache; + ce->e_queued = 0; + atomic_set(&ce->e_refcnt, 0); +found: + ce->e_block_hash_p = &cache->c_block_hash[0]; + ce->e_index_hash_p = &cache->c_index_hash[0]; ce->e_used = 1 + MB_CACHE_WRITER; return ce; } @@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, { struct mb_cache *cache = ce->e_cache; unsigned int bucket; - struct list_head *l; - int error = -EBUSY; + struct hlist_bl_node *l; + struct hlist_bl_head *block_hash_p; + struct hlist_bl_head *index_hash_p; + struct mb_cache_entry *lce; + mb_assert(ce); bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each_prev(l, &cache->c_block_hash[bucket]) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_block_list); - if (ce->e_bdev == bdev && ce->e_block == block) - goto out; + block_hash_p = &cache->c_block_hash[bucket]; + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { + if (lce->e_bdev == bdev && lce->e_block == block) { + hlist_bl_unlock(block_hash_p); + return -EBUSY; + } } - __mb_cache_entry_unhash(ce); + mb_assert(!__mb_cache_entry_is_block_hashed(ce)); + __mb_cache_entry_unhash_block(ce); + __mb_cache_entry_unhash_index(ce); ce->e_bdev = bdev; ce->e_block = block; - list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); + ce->e_block_hash_p = block_hash_p; ce->e_index.o_key = key; + hlist_bl_add_head(&ce->e_block_list, block_hash_p); + hlist_bl_unlock(block_hash_p); bucket = hash_long(key, cache->c_bucket_bits); - list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); - error = 0; -out: - spin_unlock(&mb_cache_spinlock); - return error; + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + ce->e_index_hash_p = index_hash_p; + hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); + hlist_bl_unlock(index_hash_p); + return 0; } @@ -429,24 +625,26 @@ out: void mb_cache_entry_release(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); - __mb_cache_entry_release_unlock(ce); + __mb_cache_entry_release(ce); } /* * mb_cache_entry_free() * - * This is equivalent to the sequence mb_cache_entry_takeout() -- - * mb_cache_entry_release(). */ void mb_cache_entry_free(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); + mb_assert(ce); mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_unhash(ce); - __mb_cache_entry_release_unlock(ce); + hlist_bl_lock(ce->e_index_hash_p); + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_lock(ce->e_block_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_release(ce); } @@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, sector_t block) { unsigned int bucket; - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *block_hash_p; bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each(l, &cache->c_block_hash[bucket]) { - ce = list_entry(l, struct mb_cache_entry, e_block_list); + block_hash_p = &cache->c_block_hash[bucket]; + /* First serialize access to the block corresponding hash chain. */ + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { + mb_assert(ce->e_block_hash_p == block_hash_p); if (ce->e_bdev == bdev && ce->e_block == block) { - DEFINE_WAIT(wait); + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(block_hash_p); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + if (ce->e_used > 0) { + DEFINE_WAIT(wait); + while (ce->e_used > 0) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); + } + ce->e_used += 1 + MB_CACHE_WRITER; + __spin_unlock_mb_cache_entry(ce); - if (!list_empty(&ce->e_lru_list)) + if (!list_empty(&ce->e_lru_list)) { + spin_lock(&mb_cache_spinlock); list_del_init(&ce->e_lru_list); - - while (ce->e_used > 0) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&mb_cache_spinlock); - schedule(); - spin_lock(&mb_cache_spinlock); - ce->e_queued--; } - finish_wait(&mb_cache_queue, &wait); - ce->e_used += 1 + MB_CACHE_WRITER; - - if (!__mb_cache_entry_is_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); return NULL; } - goto cleanup; + return ce; } } - ce = NULL; - -cleanup: - spin_unlock(&mb_cache_spinlock); - return ce; + hlist_bl_unlock(block_hash_p); + return NULL; } #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) static struct mb_cache_entry * -__mb_cache_entry_find(struct list_head *l, struct list_head *head, +__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, struct block_device *bdev, unsigned int key) { - while (l != head) { + + /* The index hash chain is alredy acquire by caller. */ + while (l != NULL) { struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_index.o_list); + hlist_bl_entry(l, struct mb_cache_entry, + e_index.o_list); + mb_assert(ce->e_index_hash_p == head); if (ce->e_bdev == bdev && ce->e_index.o_key == key) { - DEFINE_WAIT(wait); - - if (!list_empty(&ce->e_lru_list)) - list_del_init(&ce->e_lru_list); - + /* + * Prevent a free from removing the entry. + */ + atomic_inc(&ce->e_refcnt); + hlist_bl_unlock(head); + __spin_lock_mb_cache_entry(ce); + atomic_dec(&ce->e_refcnt); + ce->e_used++; /* Incrementing before holding the lock gives readers priority over writers. */ - ce->e_used++; - while (ce->e_used >= MB_CACHE_WRITER) { - ce->e_queued++; - prepare_to_wait(&mb_cache_queue, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&mb_cache_spinlock); - schedule(); - spin_lock(&mb_cache_spinlock); - ce->e_queued--; + if (ce->e_used >= MB_CACHE_WRITER) { + DEFINE_WAIT(wait); + + while (ce->e_used >= MB_CACHE_WRITER) { + ce->e_queued++; + prepare_to_wait(&mb_cache_queue, &wait, + TASK_UNINTERRUPTIBLE); + __spin_unlock_mb_cache_entry(ce); + schedule(); + __spin_lock_mb_cache_entry(ce); + ce->e_queued--; + } + finish_wait(&mb_cache_queue, &wait); } - finish_wait(&mb_cache_queue, &wait); - - if (!__mb_cache_entry_is_hashed(ce)) { - __mb_cache_entry_release_unlock(ce); + __spin_unlock_mb_cache_entry(ce); + if (!list_empty(&ce->e_lru_list)) { spin_lock(&mb_cache_spinlock); + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + } + if (!__mb_cache_entry_is_block_hashed(ce)) { + __mb_cache_entry_release(ce); return ERR_PTR(-EAGAIN); } return ce; } l = l->next; } + hlist_bl_unlock(head); return NULL; } @@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, unsigned int key) { unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; - struct mb_cache_entry *ce; - - spin_lock(&mb_cache_spinlock); - l = cache->c_index_hash[bucket].next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - spin_unlock(&mb_cache_spinlock); + struct hlist_bl_node *l; + struct mb_cache_entry *ce = NULL; + struct hlist_bl_head *index_hash_p; + + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + if (!hlist_bl_empty(index_hash_p)) { + l = hlist_bl_first(index_hash_p); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + } else + hlist_bl_unlock(index_hash_p); return ce; } @@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, { struct mb_cache *cache = prev->e_cache; unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *index_hash_p; - spin_lock(&mb_cache_spinlock); + index_hash_p = &cache->c_index_hash[bucket]; + mb_assert(prev->e_index_hash_p == index_hash_p); + hlist_bl_lock(index_hash_p); + mb_assert(!hlist_bl_empty(index_hash_p)); l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - __mb_cache_entry_release_unlock(prev); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + __mb_cache_entry_release(prev); return ce; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 0332109162a5..f007a3355570 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data); static void minix_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; minix_truncate(inode); @@ -86,7 +86,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), @@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) struct minix_sb_info * sbi = minix_sb(sb); struct minix_super_block * ms; + sync_filesystem(sb); ms = sbi->s_ms; if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/mount.h b/fs/mount.h index b29e42f05f34..d55297f2fa05 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,7 +10,7 @@ struct mnt_namespace { struct user_namespace *user_ns; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; - int event; + u64 event; }; struct mnt_pcp { @@ -104,6 +104,9 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); + void *cached_mount; + u64 cached_event; + loff_t cached_index; }; #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) diff --git a/fs/namei.c b/fs/namei.c index 4b491b431990..80168273396b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask) return -EACCES; } +EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without @@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask) return retval; return __inode_permission(inode, mask); } +EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path @@ -924,6 +926,7 @@ int follow_up(struct path *path) path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); /* * Perform an automount @@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down_one); static inline bool managed_dentry_might_block(struct dentry *dentry) { @@ -1223,6 +1227,7 @@ int follow_down(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() @@ -1537,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; if (should_follow_link(path->dentry, follow)) { @@ -1796,7 +1801,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } - if (!d_is_directory(nd->path.dentry)) { + if (!d_can_lookup(nd->path.dentry)) { err = -ENOTDIR; break; } @@ -1817,7 +1822,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*name) { - if (!d_is_directory(root)) + if (!d_can_lookup(root)) return -ENOTDIR; retval = inode_permission(inode, MAY_EXEC); if (retval) @@ -1873,7 +1878,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, dentry = f.file->f_path.dentry; if (*name) { - if (!d_is_directory(dentry)) { + if (!d_can_lookup(dentry)) { fdput(f); return -ENOTDIR; } @@ -1955,7 +1960,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!d_is_directory(nd->path.dentry)) { + if (!d_can_lookup(nd->path.dentry)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) *path = nd.path; return res; } +EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, *path = nd.path; return err; } +EXPORT_SYMBOL(vfs_path_lookup); /* * Restricted form of lookup. Doesn't follow links, single-component only, @@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) @@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, { return user_path_at_empty(dfd, name, flags, path, NULL); } +EXPORT_SYMBOL(user_path_at); /* * NB: most callers don't do anything directly with the reference to the @@ -2240,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode) { + if (!dentry->d_inode || d_is_negative(dentry)) { error = -ENOENT; dput(dentry); goto out; @@ -2414,11 +2423,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) return -EPERM; if (isdir) { - if (!d_is_directory(victim) && !d_is_autodir(victim)) + if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (d_is_directory(victim) || d_is_autodir(victim)) + } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return NULL; } +EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } +EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { @@ -2569,7 +2581,7 @@ static int handle_truncate(struct file *filp) /* * Refuse to truncate files with mandatory locks held on them. */ - error = locks_verify_locked(inode); + error = locks_verify_locked(filp); if (!error) error = security_path_truncate(path); if (!error) { @@ -2982,7 +2994,7 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (d_is_negative(path->dentry)) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } @@ -3016,11 +3028,10 @@ finish_open: } audit_inode(name, nd->path.dentry, 0); error = -EISDIR; - if ((open_flag & O_CREAT) && - (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry))) + if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry)) + if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; if (!S_ISREG(nd->inode->i_mode)) will_truncate = false; @@ -3376,6 +3387,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3465,6 +3477,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { @@ -3519,6 +3532,7 @@ void dentry_unhash(struct dentry *dentry) __d_drop(dentry); spin_unlock(&dentry->d_lock); } +EXPORT_SYMBOL(dentry_unhash); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -3556,6 +3570,7 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) { @@ -3673,6 +3688,7 @@ out: return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its @@ -3744,7 +3760,7 @@ exit1: slashes: if (d_is_negative(dentry)) error = -ENOENT; - else if (d_is_directory(dentry) || d_is_autodir(dentry)) + else if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; @@ -3786,6 +3802,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) @@ -3894,6 +3911,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -3974,7 +3992,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -/* +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: @@ -4002,163 +4041,140 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode, unsigned int flags) { - int error = 0; + int error; + bool is_dir = d_is_dir(old_dentry); + const unsigned char *old_name; + struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; + bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; + if (source == target) + return 0; + + error = may_delete(old_dir, old_dentry, is_dir); + if (error) + return error; + + if (!target) { + error = may_create(new_dir, new_dentry); + } else { + new_is_dir = d_is_dir(new_dentry); + + if (!(flags & RENAME_EXCHANGE)) + error = may_delete(new_dir, new_dentry, is_dir); + else + error = may_delete(new_dir, new_dentry, new_is_dir); + } + if (error) + return error; + + if (!old_dir->i_op->rename) + return -EPERM; + + if (flags && !old_dir->i_op->rename2) + return -EINVAL; + /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { - error = inode_permission(old_dentry->d_inode, MAY_WRITE); - if (error) - return error; + if (is_dir) { + error = inode_permission(source, MAY_WRITE); + if (error) + return error; + } + if ((flags & RENAME_EXCHANGE) && new_is_dir) { + error = inode_permission(target, MAY_WRITE); + if (error) + return error; + } } - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); if (error) return error; + old_name = fsnotify_oldname_init(old_dentry->d_name.name); dget(new_dentry); - if (target) + if (!is_dir || (flags & RENAME_EXCHANGE)) + lock_two_nondirectories(source, target); + else if (target) mutex_lock(&target->i_mutex); error = -EBUSY; if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; - error = -EMLINK; - if (max_links && !target && new_dir != old_dir && - new_dir->i_nlink >= max_links) - goto out; - - if (target) + if (max_links && new_dir != old_dir) { + error = -EMLINK; + if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) + goto out; + if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && + old_dir->i_nlink >= max_links) + goto out; + } + if (is_dir && !(flags & RENAME_EXCHANGE) && target) shrink_dcache_parent(new_dentry); - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; - - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); + if (!is_dir) { + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; } -out: - if (target) - mutex_unlock(&target->i_mutex); - dput(new_dentry); - if (!error) - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry,new_dentry); - return error; -} - -static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - struct inode **delegated_inode) -{ - struct inode *target = new_dentry->d_inode; - struct inode *source = old_dentry->d_inode; - int error; - - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - return error; - - dget(new_dentry); - lock_two_nondirectories(source, target); - - error = -EBUSY; - if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) - goto out; - - error = try_break_deleg(source, delegated_inode); - if (error) - goto out; - if (target) { + if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (!flags) { + error = old_dir->i_op->rename(old_dir, old_dentry, + new_dir, new_dentry); + } else { + error = old_dir->i_op->rename2(old_dir, old_dentry, + new_dir, new_dentry, flags); + } if (error) goto out; - if (target) + if (!(flags & RENAME_EXCHANGE) && target) { + if (is_dir) + target->i_flags |= S_DEAD; dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); + } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { + if (!(flags & RENAME_EXCHANGE)) + d_move(old_dentry, new_dentry); + else + d_exchange(old_dentry, new_dentry); + } out: - unlock_two_nondirectories(source, target); + if (!is_dir || (flags & RENAME_EXCHANGE)) + unlock_two_nondirectories(source, target); + else if (target) + mutex_unlock(&target->i_mutex); dput(new_dentry); - return error; -} - -/** - * vfs_rename - rename a filesystem object - * @old_dir: parent of source - * @old_dentry: source - * @new_dir: parent of destination - * @new_dentry: destination - * @delegated_inode: returns an inode needing a delegation break - * - * The caller must hold multiple mutexes--see lock_rename()). - * - * If vfs_rename discovers a delegation in need of breaking at either - * the source or destination, it will return -EWOULDBLOCK and return a - * reference to the inode in delegated_inode. The caller should then - * break the delegation and retry. Because breaking a delegation may - * take a long time, the caller should drop all locks before doing - * so. - * - * Alternatively, a caller may pass NULL for delegated_inode. This may - * be appropriate for callers that expect the underlying filesystem not - * to be NFS exported. - */ -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - struct inode **delegated_inode) -{ - int error; - int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry); - const unsigned char *old_name; - - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - - error = may_delete(old_dir, old_dentry, is_dir); - if (error) - return error; - - if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry); - else - error = may_delete(new_dir, new_dentry, is_dir); - if (error) - return error; - - if (!old_dir->i_op->rename) - return -EPERM; - - old_name = fsnotify_oldname_init(old_dentry->d_name.name); - - if (is_dir) - error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); - else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode); - if (!error) + if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, - new_dentry->d_inode, old_dentry); + !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); + if (flags & RENAME_EXCHANGE) { + fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, + new_is_dir, NULL, new_dentry); + } + } fsnotify_oldname_free(old_name); return error; } +EXPORT_SYMBOL(vfs_rename); -SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname) +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) { struct dentry *old_dir, *new_dir; struct dentry *old_dentry, *new_dentry; @@ -4170,6 +4186,13 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, unsigned int lookup_flags = 0; bool should_retry = false; int error; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + return -EINVAL; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { @@ -4193,6 +4216,8 @@ retry: goto exit2; new_dir = newnd.path.dentry; + if (flags & RENAME_NOREPLACE) + error = -EEXIST; if (newnd.last_type != LAST_NORM) goto exit2; @@ -4202,7 +4227,8 @@ retry: oldnd.flags &= ~LOOKUP_PARENT; newnd.flags &= ~LOOKUP_PARENT; - newnd.flags |= LOOKUP_RENAME_TARGET; + if (!(flags & RENAME_EXCHANGE)) + newnd.flags |= LOOKUP_RENAME_TARGET; retry_deleg: trap = lock_rename(new_dir, old_dir); @@ -4215,34 +4241,49 @@ retry_deleg: error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; + new_dentry = lookup_hash(&newnd); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + error = -EEXIST; + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) + goto exit5; + if (flags & RENAME_EXCHANGE) { + error = -ENOENT; + if (d_is_negative(new_dentry)) + goto exit5; + + if (!d_is_dir(new_dentry)) { + error = -ENOTDIR; + if (newnd.last.name[newnd.last.len]) + goto exit5; + } + } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) { + if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (oldnd.last.name[oldnd.last.len]) - goto exit4; - if (newnd.last.name[newnd.last.len]) - goto exit4; + goto exit5; + if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len]) + goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) - goto exit4; - new_dentry = lookup_hash(&newnd); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; + goto exit5; /* target should not be an ancestor of source */ - error = -ENOTEMPTY; + if (!(flags & RENAME_EXCHANGE)) + error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry); + &newnd.path, new_dentry, flags); if (error) goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry, - &delegated_inode); + new_dir->d_inode, new_dentry, + &delegated_inode, flags); exit5: dput(new_dentry); exit4: @@ -4272,16 +4313,20 @@ exit: return error; } -SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) { - return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_renameat2(olddfd, oldname, newdfd, newname, 0); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - int len; + return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); +} - len = PTR_ERR(link); +int readlink_copy(char __user *buffer, int buflen, const char *link) +{ + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -4293,6 +4338,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c out: return len; } +EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that @@ -4310,11 +4356,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(cookie)) return PTR_ERR(cookie); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; } +EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) @@ -4334,14 +4381,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); } return res; } +EXPORT_SYMBOL(page_readlink); void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) { @@ -4349,6 +4396,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) nd_set_link(nd, page_getlink(dentry, &page)); return page; } +EXPORT_SYMBOL(page_follow_link_light); void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { @@ -4359,6 +4407,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) page_cache_release(page); } } +EXPORT_SYMBOL(page_put_link); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4396,45 +4445,18 @@ retry: fail: return err; } +EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* nfsd */ -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink); diff --git a/fs/namespace.c b/fs/namespace.c index 2ffc5a2905d4..182bc41cd887 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); -static int event; +static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); static DEFINE_SPINLOCK(mnt_id_lock); @@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int __mnt_want_write_file(struct file *file) { - struct inode *inode = file_inode(file); - - if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + if (!(file->f_mode & FMODE_WRITER)) return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); @@ -570,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); - mnt_free_id(mnt); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + /* call under rcu_read_lock */ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { @@ -848,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); } @@ -885,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; @@ -928,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } -static void delayed_free(struct rcu_head *head) -{ - struct mount *mnt = container_of(head, struct mount, mnt_rcu); - kfree(mnt->mnt_devname); -#ifdef CONFIG_SMP - free_percpu(mnt->mnt_pcp); -#endif - kmem_cache_free(mnt_cache, mnt); -} - static void mntput_no_expire(struct mount *mnt) { put_again: @@ -991,7 +985,7 @@ put_again: dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } void mntput(struct vfsmount *mnt) @@ -1100,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos) struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); - return seq_list_start(&p->ns->list, *pos); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = proc_mounts(m); - return seq_list_next(v, &p->ns->list, pos); + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; } static void m_stop(struct seq_file *m, void *v) @@ -1661,9 +1670,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); if (err) goto out_cleanup_ids; - lock_mount_hash(); for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } else { @@ -1690,6 +1699,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, return 0; out_cleanup_ids: + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: return err; @@ -2044,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) struct mount *parent; int err; - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT); + mnt_flags &= ~MNT_INTERNAL_FLAGS; mp = lock_mount(path); if (IS_ERR(mp)) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index c320ac52353e..08b8ea8c353e 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -339,7 +339,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (val) goto finished; - DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n", + ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n", dentry, NCP_GET_AGE(dentry)); len = sizeof(__name); @@ -358,7 +358,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) res = ncp_obtain_info(server, dir, __name, &(finfo.i)); } finfo.volume = finfo.i.volNumber; - DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n", + ncp_dbg(2, "looked for %pd/%s, res=%d\n", dentry->d_parent, __name, res); /* * If we didn't find it, or if it has a different dirEntNum to @@ -372,14 +372,14 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_new_dentry(dentry); val=1; } else - DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n"); + ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); mutex_unlock(&inode->i_mutex); } finished: - DDPRINTK("ncp_lookup_validate: result=%d\n", val); + ncp_dbg(2, "result=%d\n", val); dput(parent); return val; } @@ -453,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) ctl.page = NULL; ctl.cache = NULL; - DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file, - (int) ctx->pos); + ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos); result = -EIO; /* Do not generate '.' and '..' when server is dead. */ @@ -697,8 +696,7 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx, struct ncp_entry_info entry; int i; - DPRINTK("ncp_read_volume_list: pos=%ld\n", - (unsigned long) ctx->pos); + ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos); for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { int inval_dentry; @@ -708,12 +706,11 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx, if (!strlen(info.volume_name)) continue; - DPRINTK("ncp_read_volume_list: found vol: %s\n", - info.volume_name); + ncp_dbg(1, "found vol: %s\n", info.volume_name); if (ncp_lookup_volume(server, info.volume_name, &entry.i)) { - DPRINTK("ncpfs: could not lookup vol %s\n", + ncp_dbg(1, "could not lookup vol %s\n", info.volume_name); continue; } @@ -738,14 +735,13 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx, int more; size_t bufsize; - DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file, - (unsigned long) ctx->pos); - PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n", - file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); + ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos); + ncp_vdbg("init %pD, volnum=%d, dirent=%u\n", + file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum); err = ncp_initialize_search(server, dir, &seq); if (err) { - DPRINTK("ncp_do_readdir: init failed, err=%d\n", err); + ncp_dbg(1, "init failed, err=%d\n", err); return; } /* We MUST NOT use server->buffer_size handshaked with server if we are @@ -808,8 +804,7 @@ int ncp_conn_logged_in(struct super_block *sb) goto out; result = -ENOENT; if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) { - PPRINTK("ncp_conn_logged_in: %s not found\n", - server->m.mounted_vol); + ncp_vdbg("%s not found\n", server->m.mounted_vol); goto out; } dent = sb->s_root; @@ -822,10 +817,10 @@ int ncp_conn_logged_in(struct super_block *sb) NCP_FINFO(ino)->DosDirNum = DosDirNum; result = 0; } else { - DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n"); + ncp_dbg(1, "sb->s_root->d_inode == NULL!\n"); } } else { - DPRINTK("ncpfs: sb->s_root == NULL!\n"); + ncp_dbg(1, "sb->s_root == NULL!\n"); } } else result = 0; @@ -846,7 +841,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig if (!ncp_conn_valid(server)) goto finished; - PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry); + ncp_vdbg("server lookup for %pd2\n", dentry); len = sizeof(__name); if (ncp_is_server_root(dir)) { @@ -854,15 +849,15 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig dentry->d_name.len, 1); if (!res) res = ncp_lookup_volume(server, __name, &(finfo.i)); - if (!res) - ncp_update_known_namespace(server, finfo.i.volNumber, NULL); + if (!res) + ncp_update_known_namespace(server, finfo.i.volNumber, NULL); } else { res = ncp_io2vol(server, __name, &len, dentry->d_name.name, dentry->d_name.len, !ncp_preserve_case(dir)); if (!res) res = ncp_obtain_info(server, dir, __name, &(finfo.i)); } - PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res); + ncp_vdbg("looked for %pd2, res=%d\n", dentry, res); /* * If we didn't find an entry, make a negative dentry. */ @@ -886,7 +881,7 @@ add_entry: } finished: - PPRINTK("ncp_lookup: result=%d\n", error); + ncp_vdbg("result=%d\n", error); return ERR_PTR(error); } @@ -909,7 +904,7 @@ out: return error; out_close: - PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry); + ncp_vdbg("%pd2 failed, closing file\n", dentry); ncp_close_file(NCP_SERVER(dir), finfo->file_handle); goto out; } @@ -923,7 +918,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, int opmode; __u8 __name[NCP_MAXPATHLEN + 1]; - PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode); + ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode); ncp_age_dentry(server, dentry); len = sizeof(__name); @@ -952,7 +947,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode, error = -ENAMETOOLONG; else if (result < 0) error = result; - DPRINTK("ncp_create: %pd2 failed\n", dentry); + ncp_dbg(1, "%pd2 failed\n", dentry); goto out; } opmode = O_WRONLY; @@ -985,7 +980,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int error, len; __u8 __name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_mkdir: making %pd2\n", dentry); + ncp_dbg(1, "making %pd2\n", dentry); ncp_age_dentry(server, dentry); len = sizeof(__name); @@ -1022,7 +1017,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) int error, result, len; __u8 __name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_rmdir: removing %pd2\n", dentry); + ncp_dbg(1, "removing %pd2\n", dentry); len = sizeof(__name); error = ncp_io2vol(server, __name, &len, dentry->d_name.name, @@ -1067,13 +1062,13 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry) int error; server = NCP_SERVER(dir); - DPRINTK("ncp_unlink: unlinking %pd2\n", dentry); + ncp_dbg(1, "unlinking %pd2\n", dentry); /* * Check whether to close the file ... */ if (inode) { - PPRINTK("ncp_unlink: closing file\n"); + ncp_vdbg("closing file\n"); ncp_make_closed(inode); } @@ -1087,7 +1082,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry) #endif switch (error) { case 0x00: - DPRINTK("ncp: removed %pd2\n", dentry); + ncp_dbg(1, "removed %pd2\n", dentry); break; case 0x85: case 0x8A: @@ -1120,7 +1115,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, int old_len, new_len; __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; - DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry); + ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry); ncp_age_dentry(server, old_dentry); ncp_age_dentry(server, new_dentry); @@ -1150,8 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, #endif switch (error) { case 0x00: - DPRINTK("ncp renamed %pd -> %pd.\n", - old_dentry, new_dentry); + ncp_dbg(1, "renamed %pd -> %pd\n", + old_dentry, new_dentry); break; case 0x9E: error = -ENAMETOOLONG; @@ -1173,7 +1168,7 @@ static int ncp_mknod(struct inode * dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) { - DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode); + ncp_dbg(1, "mode = 0%ho\n", mode); return ncp_create_new(dir, dentry, mode, rdev, 0); } return -EPERM; /* Strange, but true */ diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 8f5074e1ecb9..77640a8bfb87 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -6,6 +6,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <asm/uaccess.h> #include <linux/time.h> @@ -34,11 +36,11 @@ int ncp_make_open(struct inode *inode, int right) error = -EINVAL; if (!inode) { - printk(KERN_ERR "ncp_make_open: got NULL inode\n"); + pr_err("%s: got NULL inode\n", __func__); goto out; } - DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n", + ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n", atomic_read(&NCP_FINFO(inode)->opened), NCP_FINFO(inode)->volNumber, NCP_FINFO(inode)->dirEntNum); @@ -71,7 +73,7 @@ int ncp_make_open(struct inode *inode, int right) break; } if (result) { - PPRINTK("ncp_make_open: failed, result=%d\n", result); + ncp_vdbg("failed, result=%d\n", result); goto out_unlock; } /* @@ -83,7 +85,7 @@ int ncp_make_open(struct inode *inode, int right) } access = NCP_FINFO(inode)->access; - PPRINTK("ncp_make_open: file open, access=%x\n", access); + ncp_vdbg("file open, access=%x\n", access); if (access == right || access == O_RDWR) { atomic_inc(&NCP_FINFO(inode)->opened); error = 0; @@ -107,7 +109,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) void* freepage; size_t freelen; - DPRINTK("ncp_file_read: enter %pd2\n", dentry); + ncp_dbg(1, "enter %pd2\n", dentry); pos = *ppos; @@ -124,7 +126,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) error = ncp_make_open(inode, O_RDONLY); if (error) { - DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error); + ncp_dbg(1, "open failed, error=%d\n", error); return error; } @@ -165,7 +167,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) file_accessed(file); - DPRINTK("ncp_file_read: exit %pd2\n", dentry); + ncp_dbg(1, "exit %pd2\n", dentry); outrel: ncp_inode_close(inode); return already_read ? already_read : error; @@ -182,7 +184,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * int errno; void* bouncebuffer; - DPRINTK("ncp_file_write: enter %pd2\n", dentry); + ncp_dbg(1, "enter %pd2\n", dentry); if ((ssize_t) count < 0) return -EINVAL; pos = *ppos; @@ -211,7 +213,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * return 0; errno = ncp_make_open(inode, O_WRONLY); if (errno) { - DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno); + ncp_dbg(1, "open failed, error=%d\n", errno); return errno; } bufsize = NCP_SERVER(inode)->buffer_size; @@ -261,7 +263,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * i_size_write(inode, pos); mutex_unlock(&inode->i_mutex); } - DPRINTK("ncp_file_write: exit %pd2\n", dentry); + ncp_dbg(1, "exit %pd2\n", dentry); outrel: ncp_inode_close(inode); return already_written ? already_written : errno; @@ -269,7 +271,7 @@ outrel: static int ncp_release(struct inode *inode, struct file *file) { if (ncp_make_closed(inode)) { - DPRINTK("ncp_release: failed to close\n"); + ncp_dbg(1, "failed to close\n"); } return 0; } diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 0af3349de851..03ffde1f44d6 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -2,6 +2,8 @@ * getopt.c */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/string.h> @@ -46,8 +48,8 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts if (opts->has_arg & OPT_NOPARAM) { return opts->val; } - printk(KERN_INFO "%s: the %s option requires an argument\n", - caller, token); + pr_info("%s: the %s option requires an argument\n", + caller, token); return -EINVAL; } if (opts->has_arg & OPT_INT) { @@ -57,18 +59,18 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts if (!*v) { return opts->val; } - printk(KERN_INFO "%s: invalid numeric value in %s=%s\n", + pr_info("%s: invalid numeric value in %s=%s\n", caller, token, val); return -EDOM; } if (opts->has_arg & OPT_STRING) { return opts->val; } - printk(KERN_INFO "%s: unexpected argument %s to the %s option\n", + pr_info("%s: unexpected argument %s to the %s option\n", caller, val, token); return -EINVAL; } } - printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token); + pr_info("%s: Unrecognized mount option %s\n", caller, token); return -EOPNOTSUPP; } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2cf2ebecb55f..e31e589369a4 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -9,6 +9,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <asm/uaccess.h> @@ -99,6 +101,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { + sync_filesystem(sb); *flags |= MS_NODIRATIME; return 0; } @@ -132,7 +135,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) NCP_FINFO(inode)->access = nwinfo->access; memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle, sizeof(nwinfo->file_handle)); - DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n", + ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n", nwinfo->i.entryName, NCP_FINFO(inode)->volNumber, NCP_FINFO(inode)->dirEntNum); } @@ -140,8 +143,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo) static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi) { /* NFS namespace mode overrides others if it's set. */ - DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n", - nwi->entryName, nwi->nfs.mode); + ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode); if (nwi->nfs.mode) { /* XXX Security? */ inode->i_mode = nwi->nfs.mode; @@ -229,7 +231,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) ncp_update_attrs(inode, nwinfo); - DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode); + ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode); set_nlink(inode, 1); inode->i_uid = server->m.uid; @@ -257,7 +259,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) struct inode *inode; if (info == NULL) { - printk(KERN_ERR "ncp_iget: info is NULL\n"); + pr_err("%s: info is NULL\n", __func__); return NULL; } @@ -289,23 +291,23 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) } insert_inode_hash(inode); } else - printk(KERN_ERR "ncp_iget: iget failed!\n"); + pr_err("%s: iget failed!\n", __func__); return inode; } static void ncp_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (S_ISDIR(inode->i_mode)) { - DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino); + ncp_dbg(2, "put directory %ld\n", inode->i_ino); } if (ncp_make_closed(inode) != 0) { /* We can't do anything but complain. */ - printk(KERN_ERR "ncp_evict_inode: could not close\n"); + pr_err("%s: could not close\n", __func__); } } @@ -468,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) { struct ncp_mount_data_kernel data; struct ncp_server *server; - struct file *ncp_filp; struct inode *root_inode; - struct inode *sock_inode; struct socket *sock; int error; int default_bufsize; @@ -539,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || !gid_valid(data.gid)) goto out; - error = -EBADF; - ncp_filp = fget(data.ncp_fd); - if (!ncp_filp) - goto out; - error = -ENOTSOCK; - sock_inode = file_inode(ncp_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput; - sock = SOCKET_I(sock_inode); + sock = sockfd_lookup(data.ncp_fd, &error); if (!sock) - goto out_fput; - + goto out; + if (sock->type == SOCK_STREAM) default_bufsize = 0xF000; else @@ -572,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (error) goto out_fput; - server->ncp_filp = ncp_filp; server->ncp_sock = sock; if (data.info_fd != -1) { - struct socket *info_sock; - - error = -EBADF; - server->info_filp = fget(data.info_fd); - if (!server->info_filp) - goto out_bdi; - error = -ENOTSOCK; - sock_inode = file_inode(server->info_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput2; - info_sock = SOCKET_I(sock_inode); + struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_fput2; + goto out_bdi; + server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) goto out_fput2; - server->info_sock = info_sock; } /* server->lock = 0; */ @@ -620,7 +601,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) now because of PATH_MAX changes.. */ if (server->m.time_out < 1) { server->m.time_out = 10; - printk(KERN_INFO "You need to recompile your ncpfs utils..\n"); + pr_info("You need to recompile your ncpfs utils..\n"); } server->m.time_out = server->m.time_out * HZ / 100; server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG; @@ -681,7 +662,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) ncp_unlock_server(server); if (error < 0) goto out_rxbuf; - DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb)); + ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb)); error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */ #ifdef CONFIG_NCPFS_PACKET_SIGNING @@ -709,7 +690,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (ncp_negotiate_buffersize(server, default_bufsize, &(server->buffer_size)) != 0) goto out_disconnect; - DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size); + ncp_dbg(1, "bufsize = %d\n", server->buffer_size); memset(&finfo, 0, sizeof(finfo)); finfo.i.attributes = aDIR; @@ -738,7 +719,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) root_inode = ncp_iget(sb, &finfo); if (!root_inode) goto out_disconnect; - DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); + ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber); sb->s_root = d_make_root(root_inode); if (!sb->s_root) goto out_disconnect; @@ -764,17 +745,12 @@ out_nls: mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); out_fput2: - if (server->info_filp) - fput(server->info_filp); + if (server->info_sock) + sockfd_put(server->info_sock); out_bdi: bdi_destroy(&server->bdi); out_fput: - /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: - * - * The previously used put_filp(ncp_filp); was bogus, since - * it doesn't perform proper unlocking. - */ - fput(ncp_filp); + sockfd_put(sock); out: put_pid(data.wdog_pid); sb->s_fs_info = NULL; @@ -807,9 +783,9 @@ static void ncp_put_super(struct super_block *sb) mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); - if (server->info_filp) - fput(server->info_filp); - fput(server->ncp_filp); + if (server->info_sock) + sockfd_put(server->info_sock); + sockfd_put(server->ncp_sock); kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); @@ -984,8 +960,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) != 0) { int written; - DPRINTK("ncpfs: trying to change size to %ld\n", - attr->ia_size); + ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size); if ((result = ncp_make_open(inode, O_WRONLY)) < 0) { result = -EACCES; @@ -1071,7 +1046,7 @@ MODULE_ALIAS_FS("ncpfs"); static int __init init_ncp_fs(void) { int err; - DPRINTK("ncpfs: init_ncp_fs called\n"); + ncp_dbg(1, "called\n"); err = init_inodecache(); if (err) @@ -1088,7 +1063,7 @@ out1: static void __exit exit_ncp_fs(void) { - DPRINTK("ncpfs: exit_ncp_fs called\n"); + ncp_dbg(1, "called\n"); unregister_filesystem(&ncp_fs_type); destroy_inodecache(); } diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 60426ccb3b65..d5659d96ee7f 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -41,7 +41,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info.version != NCP_GET_FS_INFO_VERSION) { - DPRINTK("info.version invalid: %d\n", info.version); + ncp_dbg(1, "info.version invalid: %d\n", info.version); return -EINVAL; } /* TODO: info.addr = server->m.serv_addr; */ @@ -66,7 +66,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - DPRINTK("info.version invalid: %d\n", info2.version); + ncp_dbg(1, "info.version invalid: %d\n", info2.version); return -EINVAL; } info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); @@ -132,7 +132,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode, return -EFAULT; if (info2.version != NCP_GET_FS_INFO_VERSION_V2) { - DPRINTK("info.version invalid: %d\n", info2.version); + ncp_dbg(1, "info.version invalid: %d\n", info2.version); return -EINVAL; } info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid); @@ -308,8 +308,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg else result = server->reply_size; ncp_unlock_server(server); - DPRINTK("ncp_ioctl: copy %d bytes\n", - result); + ncp_dbg(1, "copy %d bytes\n", result); if (result >= 0) if (copy_to_user(request.data, bouncebuffer, result)) result = -EFAULT; @@ -385,9 +384,9 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg sr.namespace = server->name_space[sr.volNumber]; result = 0; } else - DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + ncp_dbg(1, "s_root->d_inode==NULL\n"); } else - DPRINTK("ncpfs: s_root==NULL\n"); + ncp_dbg(1, "s_root==NULL\n"); } else { sr.volNumber = -1; sr.namespace = 0; @@ -440,11 +439,11 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg NCP_FINFO(s_inode)->DosDirNum = dosde; server->root_setuped = 1; } else { - DPRINTK("ncpfs: s_root->d_inode==NULL\n"); + ncp_dbg(1, "s_root->d_inode==NULL\n"); result = -EIO; } } else { - DPRINTK("ncpfs: s_root==NULL\n"); + ncp_dbg(1, "s_root==NULL\n"); result = -EIO; } } diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 3c5dd55d284c..b359d12eb359 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -107,7 +107,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - DPRINTK("ncp_mmap: called\n"); + ncp_dbg(1, "called\n"); if (!ncp_conn_valid(NCP_SERVER(inode))) return -EIO; diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h index 31831afe1c3b..b9f69e1b1f43 100644 --- a/fs/ncpfs/ncp_fs.h +++ b/fs/ncpfs/ncp_fs.h @@ -2,30 +2,32 @@ #include "ncp_fs_i.h" #include "ncp_fs_sb.h" -/* define because it is easy to change PRINTK to {*}PRINTK */ -#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args) - #undef NCPFS_PARANOIA #ifdef NCPFS_PARANOIA -#define PPRINTK(format, args...) PRINTK(format , ## args) +#define ncp_vdbg(fmt, ...) \ + pr_debug(fmt, ##__VA_ARGS__) #else -#define PPRINTK(format, args...) +#define ncp_vdbg(fmt, ...) \ +do { \ + if (0) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) #endif #ifndef DEBUG_NCP #define DEBUG_NCP 0 #endif -#if DEBUG_NCP > 0 -#define DPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DPRINTK(format, args...) -#endif -#if DEBUG_NCP > 1 -#define DDPRINTK(format, args...) PRINTK(format , ## args) -#else -#define DDPRINTK(format, args...) + +#if DEBUG_NCP > 0 && !defined(DEBUG) +#define DEBUG #endif +#define ncp_dbg(level, fmt, ...) \ +do { \ + if (level <= DEBUG_NCP) \ + pr_debug(fmt, ##__VA_ARGS__); \ +} while (0) + #define NCP_MAX_RPC_TIMEOUT (6*HZ) diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index b81e97adc5a9..55e26fd80886 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -45,9 +45,7 @@ struct ncp_server { __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; - struct file *ncp_filp; /* File pointer to ncp socket */ struct socket *ncp_sock;/* ncp socket */ - struct file *info_filp; struct socket *info_sock; u8 sequence; @@ -111,7 +109,7 @@ struct ncp_server { spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */ - void (*data_ready)(struct sock* sk, int len); + void (*data_ready)(struct sock* sk); void (*error_report)(struct sock* sk); void (*write_space)(struct sock* sk); /* STREAM mode only */ struct { @@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work); extern void ncpdgram_rcv_proc(struct work_struct *work); extern void ncpdgram_timeout_proc(struct work_struct *work); extern void ncpdgram_timeout_call(unsigned long server); -extern void ncp_tcp_data_ready(struct sock* sk, int len); +extern void ncp_tcp_data_ready(struct sock* sk); extern void ncp_tcp_write_space(struct sock* sk); extern void ncp_tcp_error_report(struct sock* sk); diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 981a95617fc9..482387532f54 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -9,14 +9,14 @@ * */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "ncp_fs.h" static inline void assert_server_locked(struct ncp_server *server) { if (server->lock == 0) { - DPRINTK("ncpfs: server not locked!\n"); + ncp_dbg(1, "server not locked!\n"); } } @@ -75,7 +75,7 @@ static void ncp_add_pstring(struct ncp_server *server, const char *s) int len = strlen(s); assert_server_locked(server); if (len > 255) { - DPRINTK("ncpfs: string too long: %s\n", s); + ncp_dbg(1, "string too long: %s\n", s); len = 255; } ncp_add_byte(server, len); @@ -225,7 +225,7 @@ int ncp_get_volume_info_with_number(struct ncp_server* server, result = -EIO; len = ncp_reply_byte(server, 29); if (len > NCP_VOLNAME_LEN) { - DPRINTK("ncpfs: volume name too long: %d\n", len); + ncp_dbg(1, "volume name too long: %d\n", len); goto out; } memcpy(&(target->volume_name), ncp_reply_data(server, 30), len); @@ -259,7 +259,7 @@ int ncp_get_directory_info(struct ncp_server* server, __u8 n, result = -EIO; len = ncp_reply_byte(server, 21); if (len > NCP_VOLNAME_LEN) { - DPRINTK("ncpfs: volume name too long: %d\n", len); + ncp_dbg(1, "volume name too long: %d\n", len); goto out; } memcpy(&(target->volume_name), ncp_reply_data(server, 22), len); @@ -295,9 +295,9 @@ ncp_make_closed(struct inode *inode) err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle); if (!err) - PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n", - NCP_FINFO(inode)->volNumber, - NCP_FINFO(inode)->dirEntNum, err); + ncp_vdbg("volnum=%d, dirent=%u, error=%d\n", + NCP_FINFO(inode)->volNumber, + NCP_FINFO(inode)->dirEntNum, err); } mutex_unlock(&NCP_FINFO(inode)->open_mutex); return err; @@ -394,8 +394,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server, if ((result = ncp_request(server, 87)) == 0) { ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs); - DPRINTK(KERN_DEBUG - "ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n", + ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n", target->entryName, target->nfs.mode, target->nfs.rdev); } else { @@ -425,7 +424,7 @@ int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *pa int result; if (target == NULL) { - printk(KERN_ERR "ncp_obtain_info: invalid call\n"); + pr_err("%s: invalid call\n", __func__); return -EINVAL; } ncp_init_request(server); @@ -498,7 +497,7 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume) namespace = ncp_reply_data(server, 2); while (no_namespaces > 0) { - DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume); + ncp_dbg(1, "found %d on %d\n", *namespace, volume); #ifdef CONFIG_NCPFS_NFS_NS if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS)) @@ -531,8 +530,7 @@ ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns) if (ret_ns) *ret_ns = ns; - DPRINTK("lookup_vol: namespace[%d] = %d\n", - volume, server->name_space[volume]); + ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]); if (server->name_space[volume] == ns) return 0; @@ -596,7 +594,7 @@ ncp_get_volume_root(struct ncp_server *server, { int result; - DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname); + ncp_dbg(1, "looking up vol %s\n", volname); ncp_init_request(server); ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */ diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 3a1587222c8a..471bc3d1139e 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -8,6 +8,7 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/time.h> #include <linux/errno.h> @@ -96,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req) kfree(req); } -void ncp_tcp_data_ready(struct sock *sk, int len) +void ncp_tcp_data_ready(struct sock *sk) { struct ncp_server *server = sk->sk_user_data; - server->data_ready(sk, len); + server->data_ready(sk); schedule_work(&server->rcv.tq); } @@ -231,7 +232,7 @@ static void __ncptcp_try_send(struct ncp_server *server) return; if (result < 0) { - printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result); + pr_err("tcp: Send failed: %d\n", result); __ncp_abort_request(server, rq, result); return; } @@ -332,7 +333,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply * mutex_lock(&server->rcv.creq_mutex); if (!ncp_conn_valid(server)) { mutex_unlock(&server->rcv.creq_mutex); - printk(KERN_ERR "ncpfs: tcp: Server died\n"); + pr_err("tcp: Server died\n"); return -EIO; } ncp_req_get(req); @@ -405,15 +406,15 @@ void ncpdgram_rcv_proc(struct work_struct *work) } result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT); if (result < 0) { - DPRINTK("recv failed with %d\n", result); + ncp_dbg(1, "recv failed with %d\n", result); continue; } if (result < 10) { - DPRINTK("too short (%u) watchdog packet\n", result); + ncp_dbg(1, "too short (%u) watchdog packet\n", result); continue; } if (buf[9] != '?') { - DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]); + ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]); continue; } buf[9] = 'Y'; @@ -448,7 +449,7 @@ void ncpdgram_rcv_proc(struct work_struct *work) result -= 8; hdrl = sock->sk->sk_family == AF_INET ? 8 : 6; if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) { - printk(KERN_INFO "ncpfs: Signature violation\n"); + pr_info("Signature violation\n"); result = -EIO; } } @@ -524,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len) return result; } if (result > len) { - printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len); + pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len); return -EIO; } return result; @@ -552,9 +553,9 @@ static int __ncptcp_rcv_proc(struct ncp_server *server) __ncptcp_abort(server); } if (result < 0) { - printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result); + pr_err("tcp: error in recvmsg: %d\n", result); } else { - DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n"); + ncp_dbg(1, "tcp: EOF\n"); } return -EIO; } @@ -566,20 +567,20 @@ static int __ncptcp_rcv_proc(struct ncp_server *server) switch (server->rcv.state) { case 0: if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); + pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic)); __ncptcp_abort(server); return -EIO; } datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF; if (datalen < 10) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + pr_err("tcp: Unexpected reply len %d\n", datalen); __ncptcp_abort(server); return -EIO; } #ifdef CONFIG_NCPFS_PACKET_SIGNING if (server->sign_active) { if (datalen < 18) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen); + pr_err("tcp: Unexpected reply len %d\n", datalen); __ncptcp_abort(server); return -EIO; } @@ -604,7 +605,7 @@ cont:; server->rcv.len = datalen - 10; break; } - DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type); + ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type); skipdata2:; server->rcv.state = 2; skipdata:; @@ -614,11 +615,11 @@ skipdata:; } req = server->rcv.creq; if (!req) { - DPRINTK(KERN_ERR "ncpfs: Reply without appropriate request\n"); + ncp_dbg(1, "Reply without appropriate request\n"); goto skipdata2; } if (datalen > req->datalen + 8) { - printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); + pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8); server->rcv.state = 3; goto skipdata; } @@ -638,12 +639,12 @@ skipdata:; req = server->rcv.creq; if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) { if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) { - printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n"); + pr_err("tcp: Bad sequence number\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) { - printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n"); + pr_err("tcp: Connection number mismatch\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } @@ -651,7 +652,7 @@ skipdata:; #ifdef CONFIG_NCPFS_PACKET_SIGNING if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) { if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) { - printk(KERN_ERR "ncpfs: tcp: Signature violation\n"); + pr_err("tcp: Signature violation\n"); __ncp_abort_request(server, req, -EIO); return -EIO; } @@ -742,7 +743,7 @@ static int ncp_do_request(struct ncp_server *server, int size, int result; if (server->lock == 0) { - printk(KERN_ERR "ncpfs: Server not locked!\n"); + pr_err("Server not locked!\n"); return -EIO; } if (!ncp_conn_valid(server)) { @@ -781,7 +782,7 @@ static int ncp_do_request(struct ncp_server *server, int size, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - DDPRINTK("do_ncp_rpc_call returned %d\n", result); + ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result); return result; } @@ -811,7 +812,7 @@ int ncp_request2(struct ncp_server *server, int function, result = ncp_do_request(server, server->current_size, reply, size); if (result < 0) { - DPRINTK("ncp_request_error: %d\n", result); + ncp_dbg(1, "ncp_request_error: %d\n", result); goto out; } server->completion = reply->completion_code; @@ -822,7 +823,7 @@ int ncp_request2(struct ncp_server *server, int function, result = reply->completion_code; if (result != 0) - PPRINTK("ncp_request: completion code=%x\n", result); + ncp_vdbg("completion code=%x\n", result); out: return result; } @@ -865,14 +866,14 @@ void ncp_lock_server(struct ncp_server *server) { mutex_lock(&server->mutex); if (server->lock) - printk(KERN_WARNING "ncp_lock_server: was locked!\n"); + pr_warn("%s: was locked!\n", __func__); server->lock = 1; } void ncp_unlock_server(struct ncp_server *server) { if (!server->lock) { - printk(KERN_WARNING "ncp_unlock_server: was not locked!\n"); + pr_warn("%s: was not locked!\n", __func__); return; } server->lock = 0; diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 52439ddc8de0..1a63bfdb4a65 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -112,7 +112,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { __le32 attr; unsigned int hdr; - DPRINTK("ncp_symlink(dir=%p,dentry=%p,symname=%s)\n",dir,dentry,symname); + ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname); if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) kludge = 0; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 56ff823ca82e..65d849bdf77a 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); if (end != NFS_I(inode)->npages) { rcu_read_lock(); - end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX); + end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index ae2e87b95453..41db5258e7a7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -112,7 +112,8 @@ out: * TODO: keep track of all layouts (and delegations) in a hash table * hashed by filehandle. */ -static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) { struct nfs_server *server; struct inode *ino; @@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + continue; if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) continue; ino = igrab(lo->plh_inode); if (!ino) - continue; + break; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ if (NFS_I(ino)->layout != lo) { spin_unlock(&ino->i_lock); iput(ino); - continue; + break; } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, return NULL; } -static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh) +static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, + struct nfs_fh *fh, nfs4_stateid *stateid) { struct pnfs_layout_hdr *lo; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh); + lo = get_layout_by_fh_locked(clp, fh, stateid); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh); + lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); if (!lo) - return NFS4ERR_NOMATCHING_LAYOUT; + goto out; ino = lo->plh_inode; spin_lock(&ino->i_lock); @@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); iput(ino); +out: return rv; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4a48fe4b84b6..d9f3d067cd15 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = { static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred) { + struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { ctx->duped = 0; - ctx->attr_gencount = NFS_I(dir)->attr_gencount; + ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; ctx->cred = get_rpccred(cred); + spin_lock(&dir->i_lock); + list_add(&ctx->list, &nfsi->open_files); + spin_unlock(&dir->i_lock); return ctx; } return ERR_PTR(-ENOMEM); } -static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { + spin_lock(&dir->i_lock); + list_del(&ctx->list); + spin_unlock(&dir->i_lock); put_rpccred(ctx->cred); kfree(ctx); } @@ -126,7 +133,7 @@ out: static int nfs_closedir(struct inode *inode, struct file *filp) { - put_nfs_open_dir_context(filp->private_data); + put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data); return 0; } @@ -306,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. " - "The file: %s has duplicate cookie %llu\n", - desc->file, - array->array[i].string.name, - *desc->dir_cookie); + "The file: %.*s has duplicate cookie %llu\n", + desc->file, array->array[i].string.len, + array->array[i].string.name, *desc->dir_cookie); } status = -ELOOP; goto out; @@ -437,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); } +/* + * This function is mainly for use by nfs_getattr(). + * + * If this is an 'ls -l', we want to force use of readdirplus. + * Do this by checking if there is an active file descriptor + * and calling nfs_advise_use_readdirplus, then forcing a + * cache flush. + */ +void nfs_force_use_readdirplus(struct inode *dir) +{ + if (!list_empty(&NFS_I(dir)->open_files)) { + nfs_advise_use_readdirplus(dir); + nfs_zap_mapping(dir, dir->i_mapping); + } +} + static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) { @@ -815,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) goto out; } +static bool nfs_dir_mapping_need_revalidate(struct inode *dir) +{ + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_attribute_cache_expired(dir)) + return true; + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + return true; + return false; +} + /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -847,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; nfs_block_sillyrename(dentry); - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) + if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -1911,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct dentry *dentry = NULL, *rehash = NULL; + struct rpc_task *task; int error = -EBUSY; dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", @@ -1958,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, - new_dir, &new_dentry->d_name); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + if (IS_ERR(task)) { + error = PTR_ERR(task); + goto out; + } + + error = rpc_wait_for_completion_task(task); + if (error == 0) + error = task->tk_status; + rpc_put_task(task); nfs_mark_for_revalidate(old_inode); out: if (rehash) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5bb790a69c71..284ca901fe16 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -617,6 +617,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 360114ae8b82..0c438973f3c8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(nfs_clear_inode); void nfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nfs_clear_inode(inode); } @@ -588,6 +588,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); +static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +{ + struct dentry *parent; + + parent = dget_parent(dentry); + nfs_force_use_readdirplus(parent->d_inode); + dput(parent); +} + +static bool nfs_need_revalidate_inode(struct inode *inode) +{ + if (NFS_I(inode)->cache_validity & + (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) + return true; + if (nfs_attribute_cache_expired(inode)) + return true; + return false; +} + int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -616,10 +635,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; - if (need_atime) - err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - err = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (need_atime || nfs_need_revalidate_inode(inode)) { + struct nfs_server *server = NFS_SERVER(inode); + + if (server->caps & NFS_CAP_READDIRPLUS) + nfs_request_parent_use_readdirplus(dentry); + err = __nfs_revalidate_inode(server, inode); + } if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); @@ -961,9 +983,7 @@ int nfs_attribute_cache_expired(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if (!(NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - && !nfs_attribute_cache_expired(inode)) + if (!nfs_need_revalidate_inode(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b46cf5a67329..dd8bfc2e2464 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -301,6 +301,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const char *ip_addr); /* dir.c */ +extern void nfs_force_use_readdirplus(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, @@ -474,6 +475,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +/* unlink.c */ +extern struct rpc_task * +nfs_async_rename(struct inode *old_dir, struct inode *new_dir, + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)); +extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); + /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a462ef0fb5d6..db60149c4579 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -479,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .old_name = old_name, - .new_dir = NFS_FH(new_dir), - .new_name = new_name, - }; - struct nfs_renameres res; - struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status = -ENOMEM; - - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - - res.old_fattr = nfs_alloc_fattr(); - res.new_fattr = nfs_alloc_fattr(); - if (res.old_fattr == NULL || res.new_fattr == NULL) - goto out; - - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_post_op_update_inode(old_dir, res.old_fattr); - nfs_post_op_update_inode(new_dir, res.new_fattr); -out: - nfs_free_fattr(res.old_fattr); - nfs_free_fattr(res.new_fattr); - dprintk("NFS reply rename: %d\n", status); - return status; -} - -static int nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs3_linkargs arg = { @@ -968,7 +933,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .unlink_setup = nfs3_proc_unlink_setup, .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare, .unlink_done = nfs3_proc_unlink_done, - .rename = nfs3_proc_rename, .rename_setup = nfs3_proc_rename_setup, .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare, .rename_done = nfs3_proc_rename_done, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a5b27c2d9689..e1d1badbe53c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -427,6 +427,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid); +extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *); extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern int nfs4_wait_clnt_recover(struct nfs_client *clp); extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); @@ -500,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei return memcmp(dst, src, sizeof(*dst)) == 0; } +static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0; +} + +static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2) +{ + return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 0e46d3d1b6cc..aa9ef4876046 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -531,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new, *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); + goto out; + case -ERESTARTSYS: + case -ETIMEDOUT: + /* The callback path may have been inadvertently + * changed. Schedule recovery! + */ + nfs4_schedule_path_down_recovery(pos); default: goto out; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 450bfedbe2f4..397be39c6dc8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1068,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref) dput(p->dentry); nfs_sb_deactive(sb); nfs_fattr_free_names(&p->f_attr); + kfree(p->f_attr.mdsthreshold); kfree(p); } @@ -1137,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) nfs4_state_set_mode_locked(state, state->state | fmode); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) +{ + struct nfs_client *clp = state->owner->so_server->nfs_client; + bool need_recover = false; + + if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly) + need_recover = true; + if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly) + need_recover = true; + if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr) + need_recover = true; + if (need_recover) + nfs4_state_mark_reclaim_nograce(clp, state); +} + +static bool nfs_need_update_open_stateid(struct nfs4_state *state, + nfs4_stateid *stateid) +{ + if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) + return true; + if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs_test_and_clear_all_open_stateid(state); + return true; + } + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + return true; + return false; +} + +static void nfs_clear_open_stateid_locked(struct nfs4_state *state, + nfs4_stateid *stateid, fmode_t fmode) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); + switch (fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_WRITE: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_READ: + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case 0: + clear_bit(NFS_O_RDONLY_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_OPEN_STATE, &state->flags); + } + if (stateid == NULL) + return; + if (!nfs_need_update_open_stateid(state, stateid)) + return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); - set_bit(NFS_OPEN_STATE, &state->flags); +} + +static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ + write_seqlock(&state->seqlock); + nfs_clear_open_stateid_locked(state, stateid, fmode); + write_sequnlock(&state->seqlock); + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); +} + +static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +{ switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -1153,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } -} - -static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) -{ - write_seqlock(&state->seqlock); - nfs_set_open_stateid_locked(state, stateid, fmode); - write_sequnlock(&state->seqlock); + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) + nfs4_stateid_copy(&state->stateid, stateid); + nfs4_stateid_copy(&state->open_stateid, stateid); } static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) @@ -1217,6 +1275,8 @@ no_delegation: __update_open_stateid(state, open_stateid, NULL, fmode); ret = 1; } + if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) + nfs4_schedule_state_manager(state->owner->so_server->nfs_client); return ret; } @@ -1450,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * struct nfs4_state *newstate; int ret; + /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ + clear_bit(NFS_O_RDWR_STATE, &state->flags); + clear_bit(NFS_O_WRONLY_STATE, &state->flags); + clear_bit(NFS_O_RDONLY_STATE, &state->flags); /* memory barrier prior to reading state->n_* */ clear_bit(NFS_DELEGATED_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { - clear_bit(NFS_O_RDWR_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1463,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_wronly != 0) { - clear_bit(NFS_O_WRONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1471,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_rdonly != 0) { - clear_bit(NFS_O_RDONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); if (ret != 0) return ret; @@ -2244,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir, } } - if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { - opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); - if (!opendata->f_attr.mdsthreshold) - goto err_free_label; + if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) { + if (!opendata->f_attr.mdsthreshold) { + opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); + if (!opendata->f_attr.mdsthreshold) + goto err_free_label; + } opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (dentry->d_inode != NULL) @@ -2275,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir, if (opendata->file_created) *opened |= FILE_CREATED; - if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) + if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) { *ctx_th = opendata->f_attr.mdsthreshold; - else - kfree(opendata->f_attr.mdsthreshold); - opendata->f_attr.mdsthreshold = NULL; + opendata->f_attr.mdsthreshold = NULL; + } nfs4_label_free(olabel); @@ -2289,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir, err_free_label: nfs4_label_free(olabel); err_opendata_put: - kfree(opendata->f_attr.mdsthreshold); nfs4_opendata_put(opendata); err_put_state_owner: nfs4_put_state_owner(sp); @@ -2479,26 +2540,6 @@ static void nfs4_free_closedata(void *data) kfree(calldata); } -static void nfs4_close_clear_stateid_flags(struct nfs4_state *state, - fmode_t fmode) -{ - spin_lock(&state->owner->so_lock); - clear_bit(NFS_O_RDWR_STATE, &state->flags); - switch (fmode & (FMODE_READ|FMODE_WRITE)) { - case FMODE_WRITE: - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - break; - case FMODE_READ: - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - break; - case 0: - clear_bit(NFS_O_RDONLY_STATE, &state->flags); - clear_bit(NFS_O_WRONLY_STATE, &state->flags); - clear_bit(NFS_OPEN_STATE, &state->flags); - } - spin_unlock(&state->owner->so_lock); -} - static void nfs4_close_done(struct rpc_task *task, void *data) { struct nfs4_closedata *calldata = data; @@ -2517,9 +2558,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); - nfs_set_open_stateid(state, &calldata->res.stateid, 0); + nfs_clear_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - break; + goto out_release; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -2533,7 +2574,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) goto out_release; } } - nfs4_close_clear_stateid_flags(state, calldata->arg.fmode); + nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); @@ -3507,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 1; } -static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_server *server = NFS_SERVER(old_dir); - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .new_dir = NFS_FH(new_dir), - .old_name = old_name, - .new_name = new_name, - }; - struct nfs_renameres res = { - .server = server, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME], - .rpc_argp = &arg, - .rpc_resp = &res, - }; - int status = -ENOMEM; - - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); - if (!status) { - update_changeattr(old_dir, &res.old_cinfo); - update_changeattr(new_dir, &res.new_cinfo); - } - return status; -} - -static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_proc_rename(old_dir, old_name, - new_dir, new_name); - trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err); - err = nfs4_handle_exception(NFS_SERVER(old_dir), err, - &exception); - } while (exception.retry); - return err; -} - static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs_server *server = NFS_SERVER(inode); @@ -4884,6 +4882,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp, nodename); } +/* + * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback + * services. Advertise one based on the address family of the + * clientaddr. + */ +static unsigned int +nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len) +{ + if (strchr(clp->cl_ipaddr, ':') != NULL) + return scnprintf(buf, len, "tcp6"); + else + return scnprintf(buf, len, "tcp"); +} + /** * nfs4_proc_setclientid - Negotiate client ID * @clp: state data structure @@ -4925,12 +4937,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, setclientid.sc_name, sizeof(setclientid.sc_name)); /* cb_client4 */ - rcu_read_lock(); - setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, - sizeof(setclientid.sc_netid), "%s", - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_NETID)); - rcu_read_unlock(); + setclientid.sc_netid_len = + nfs4_init_callback_netid(clp, + setclientid.sc_netid, + sizeof(setclientid.sc_netid)); setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); @@ -8408,7 +8418,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .unlink_setup = nfs4_proc_unlink_setup, .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, .unlink_done = nfs4_proc_unlink_done, - .rename = nfs4_proc_rename, .rename_setup = nfs4_proc_rename_setup, .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, .rename_done = nfs4_proc_rename_done, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0deb32105ccf..2349518eef2c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1316,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st return 1; } -static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -2075,8 +2075,10 @@ again: switch (status) { case 0: break; - case -NFS4ERR_DELAY: case -ETIMEDOUT: + if (clnt->cl_softrtry) + break; + case -NFS4ERR_DELAY: case -EAGAIN: ssleep(1); case -NFS4ERR_STALE_CLIENTID: diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 808f29574412..6f340f02f2ba 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -90,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) */ static void nfs4_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); pnfs_return_layout(inode); pnfs_destroy_layout(NFS_I(inode)); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 72f3bf1754ef..73ce8d4fe2c8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -203,8 +203,7 @@ static int nfs4_stat_to_errno(int); 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz + \ - nfs4_label_maxsz + nfs4_fattr_maxsz) + decode_verifier_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) #define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4755858e37a0..cb53d450ae32 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) */ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) { - return (s32)s1 - (s32)s2 > 0; + return (s32)(s1 - s2) > 0; +} + +static void +pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, + struct list_head *free_me_list) +{ + if (nfs4_stateid_match_other(&lo->plh_stateid, new)) + return; + /* Layout is new! Kill existing layout segments */ + pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); } /* update lo->plh_stateid with new if is more recent */ @@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; + LIST_HEAD(free_me); int status = 0; /* Inject layout blob into I/O device driver */ @@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget_reply; } + /* Check that the new stateid matches the old stateid */ + pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); /* Done processing layoutget. Set the layout stateid */ pnfs_set_layout_stateid(lo, &res->stateid, false); @@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me); return lseg; out: return ERR_PTR(status); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index fddbba2d9eff..e55ce9e8b034 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, } static int -nfs_proc_rename(struct inode *old_dir, struct qstr *old_name, - struct inode *new_dir, struct qstr *new_name) -{ - struct nfs_renameargs arg = { - .old_dir = NFS_FH(old_dir), - .old_name = old_name, - .new_dir = NFS_FH(new_dir), - .new_name = new_name, - }; - struct rpc_message msg = { - .rpc_proc = &nfs_procedures[NFSPROC_RENAME], - .rpc_argp = &arg, - }; - int status; - - dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name); - status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0); - nfs_mark_for_revalidate(old_dir); - nfs_mark_for_revalidate(new_dir); - dprintk("NFS reply rename: %d\n", status); - return status; -} - -static int nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) { struct nfs_linkargs arg = { @@ -745,7 +721,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .unlink_setup = nfs_proc_unlink_setup, .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, .unlink_done = nfs_proc_unlink_done, - .rename = nfs_proc_rename, .rename_setup = nfs_proc_rename_setup, .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, .rename_done = nfs_proc_rename_done, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 910ed906eb82..2cb56943e232 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; u32 nfsvers = nfss->nfs_client->rpc_ops->version; + sync_filesystem(sb); + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 11d78944de79..de54129336c6 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -14,6 +14,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <linux/namei.h> +#include <linux/fsnotify.h> #include "internal.h" #include "nfs4_fs.h" @@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata) return; } - if (task->tk_status != 0) - nfs_cancel_async_unlink(old_dentry); + if (data->complete) + data->complete(task, data); } /** @@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = { * * It's expected that valid references to the dentries and inodes are held */ -static struct rpc_task * +struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, - struct dentry *old_dentry, struct dentry *new_dentry) + struct dentry *old_dentry, struct dentry *new_dentry, + void (*complete)(struct rpc_task *, struct nfs_renamedata *)) { struct nfs_renamedata *data; struct rpc_message msg = { }; @@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, data->new_dentry = dget(new_dentry); nfs_fattr_init(&data->old_fattr); nfs_fattr_init(&data->new_fattr); + data->complete = complete; /* set up nfs_renameargs */ data->args.old_dir = NFS_FH(old_dir); @@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, return rpc_run_task(&task_setup_data); } +/* + * Perform tasks needed when a sillyrename is done such as cancelling the + * queued async unlink if it failed. + */ +static void +nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *dentry = data->old_dentry; + + if (task->tk_status != 0) { + nfs_cancel_async_unlink(dentry); + return; + } + + /* + * vfs_unlink and the like do not issue this when a file is + * sillyrenamed, so do it here. + */ + fsnotify_nameremove(dentry, 0); +} + #define SILLYNAME_PREFIX ".nfs" #define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) #define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) @@ -548,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) } /* run the rename task, undo unlink if it fails */ - task = nfs_async_rename(dir, dir, dentry, sdentry); + task = nfs_async_rename(dir, dir, dentry, sdentry, + nfs_complete_sillyrename); if (IS_ERR(task)) { error = -EBUSY; nfs_cancel_async_unlink(dentry); diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index a812fd1b92a4..b481e1f5eecc 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -39,9 +39,13 @@ struct nfs4_acl; struct svc_fh; struct svc_rqst; -/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to - * fit in a page: */ -#define NFS4_ACL_MAX 170 +/* + * Maximum ACL we'll accept from a client; chosen (somewhat + * arbitrarily) so that kmalloc'ing the ACL shouldn't require a + * high-order allocation. This allows 204 ACEs on x86_64: + */ +#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ + / sizeof(struct nfs4_ace)) struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 06cddd572264..2645be435e75 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -71,10 +71,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (gid_eq(new->fsgid, INVALID_GID)) new->fsgid = exp->ex_anon_gid; - ret = set_groups(new, gi); + set_groups(new, gi); put_group_info(gi); - if (ret < 0) - goto error; if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); @@ -89,7 +87,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) oom: ret = -ENOMEM; -error: abort_creds(new); return ret; } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index d190e33d0ec2..f66c66b9f182 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl) * by uid/gid. */ int i, j; - if (pacl->a_count <= 4) - return; /* no users or groups */ + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + i = 1; while (pacl->a_entries[i].e_tag == ACL_USER) i++; @@ -530,19 +532,21 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) /* * ACLs with no ACEs are treated differently in the inheritable - * and effective cases: when there are no inheritable ACEs, we - * set a zero-length default posix acl: + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { - pacl = posix_acl_alloc(0, GFP_KERNEL); - return pacl ? pacl : ERR_PTR(-ENOMEM); - } + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + /* * When there are no effective ACEs, the following will end * up setting a 3-element effective posix ACL with all * permissions zero. */ - nace = 4 + state->users->n + state->groups->n; + if (!state->users->n && !state->groups->n) + nace = 3; + else /* Note we also include a MASK ACE in this case: */ + nace = 4 + state->users->n + state->groups->n; pacl = posix_acl_alloc(nace, GFP_KERNEL); if (!pacl) return ERR_PTR(-ENOMEM); @@ -586,9 +590,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) add_to_mask(state, &state->groups->aces[i].perms); } - pace++; - pace->e_tag = ACL_MASK; - low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + if (state->users->n || state->groups->n) { + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + } pace++; pace->e_tag = ACL_OTHER; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7f05cd140de3..2c73cae9899d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -32,6 +32,7 @@ */ #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/slab.h> #include "nfsd.h" @@ -635,11 +636,29 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc } } +static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) +{ + struct rpc_xprt *xprt; + + if (args->protocol != XPRT_TRANSPORT_BC_TCP) + return rpc_create(args); + + xprt = args->bc_xprt->xpt_bc_xprt; + if (xprt) { + xprt_get(xprt); + return rpc_create_xprt(args, xprt); + } + + return rpc_create(args); +} + static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { + int maxtime = max_cb_time(clp->net); struct rpc_timeout timeparms = { - .to_initval = max_cb_time(clp->net), + .to_initval = maxtime, .to_retries = 0, + .to_maxval = maxtime, }; struct rpc_create_args args = { .net = clp->net, @@ -674,7 +693,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ - client = rpc_create(&args); + client = create_backchannel_client(&args); if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client: %ld\n", PTR_ERR(client)); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 82189b208af3..d543222babf3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1273,6 +1273,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, struct nfsd4_op *op; struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; + struct svc_fh *current_fh = &cstate->current_fh; + struct svc_fh *save_fh = &cstate->save_fh; int slack_bytes; u32 plen = 0; __be32 status; @@ -1288,11 +1290,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->tag = args->tag; resp->opcnt = 0; resp->rqstp = rqstp; - resp->cstate.minorversion = args->minorversion; - resp->cstate.replay_owner = NULL; - resp->cstate.session = NULL; - fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); - fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + cstate->minorversion = args->minorversion; + cstate->replay_owner = NULL; + cstate->session = NULL; + fh_init(current_fh, NFS4_FHSIZE); + fh_init(save_fh, NFS4_FHSIZE); /* * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. @@ -1345,20 +1347,28 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc = OPDESC(op); - if (!cstate->current_fh.fh_dentry) { + if (!current_fh->fh_dentry) { if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { op->status = nfserr_nofilehandle; goto encode_op; } - } else if (cstate->current_fh.fh_export->ex_fslocs.migrated && + } else if (current_fh->fh_export->ex_fslocs.migrated && !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { op->status = nfserr_moved; goto encode_op; } + fh_clear_wcc(current_fh); + /* If op is non-idempotent */ if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { plen = opdesc->op_rsize_bop(rqstp, op); + /* + * If there's still another operation, make sure + * we'll have space to at least encode an error: + */ + if (resp->opcnt < args->opcnt) + plen += COMPOUND_ERR_SLACK_SPACE; op->status = nfsd4_check_resp_size(resp, plen); } @@ -1377,12 +1387,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, clear_current_stateid(cstate); if (need_wrongsec_check(rqstp)) - op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp); + op->status = check_nfsd_access(current_fh->fh_export, rqstp); } encode_op: /* Only from SEQUENCE */ - if (resp->cstate.status == nfserr_replay_cache) { + if (cstate->status == nfserr_replay_cache) { dprintk("%s NFS4.1 replay from cache\n", __func__); status = op->status; goto out; @@ -1411,10 +1421,10 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } - resp->cstate.status = status; - fh_put(&resp->cstate.current_fh); - fh_put(&resp->cstate.save_fh); - BUG_ON(resp->cstate.replay_owner); + cstate->status = status; + fh_put(current_fh); + fh_put(save_fh); + BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ rqstp->rq_usedeferral = 1; @@ -1523,7 +1533,8 @@ static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32); + return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) * + sizeof(__be32); } static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d5d070fbeb35..9a77a5a21557 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return NULL; } clp->cl_name.len = name.len; + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; } @@ -1095,6 +1107,7 @@ free_client(struct nfs4_client *clp) WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); @@ -1347,7 +1360,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, if (clp == NULL) return NULL; - INIT_LIST_HEAD(&clp->cl_sessions); ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { spin_lock(&nn->client_lock); @@ -1355,20 +1367,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, spin_unlock(&nn->client_lock); return NULL; } - idr_init(&clp->cl_stateids); - atomic_set(&clp->cl_refcount, 0); - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); - INIT_LIST_HEAD(&clp->cl_revoked); - spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); - rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); gen_confirm(clp); @@ -1538,7 +1539,7 @@ out_err: } /* - * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size. + * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. */ void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) @@ -1596,7 +1597,7 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, * The sequence operation is not cached because we can use the slot and * session values. */ -__be32 +static __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { @@ -1605,9 +1606,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, dprintk("--> %s slot %p\n", __func__, slot); - /* Either returns 0 or nfserr_retry_uncached */ status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); - if (status == nfserr_retry_uncached_rep) + if (status) return status; /* The sequence operation has been encoded, cstate->datap set. */ @@ -2287,7 +2287,8 @@ out: if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; out_no_session: - kfree(conn); + if (conn) + free_conn(conn); spin_unlock(&nn->client_lock); return status; out_put_session: @@ -3627,8 +3628,11 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, nn, &cl); - if (status == nfserr_stale_clientid) + if (status == nfserr_stale_clientid) { + if (sessions) + return nfserr_bad_stateid; return nfserr_stale_stateid; + } if (status) return status; *s = find_stateid_by_type(cl, stateid, typemask); @@ -3713,9 +3717,16 @@ out: static __be32 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) { - if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner))) + struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); + + if (check_for_locks(stp->st_file, lo)) return nfserr_locks_held; - release_lock_stateid(stp); + /* + * Currently there's a 1-1 lock stateid<->lockowner + * correspondance, and we have to delete the lockowner when we + * delete the lock stateid: + */ + unhash_lockowner(lo); return nfs_ok; } @@ -4155,6 +4166,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c if (!same_owner_str(&lo->lo_owner, owner, clid)) return false; + if (list_empty(&lo->lo_owner.so_stateids)) { + WARN_ON_ONCE(1); + return false; + } lst = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); return lst->st_file->fi_inode == inode; @@ -5062,7 +5077,6 @@ nfs4_state_destroy_net(struct net *net) int i; struct nfs4_client *clp = NULL; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct rb_node *node, *tmp; for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->conf_id_hashtbl[i])) { @@ -5071,13 +5085,11 @@ nfs4_state_destroy_net(struct net *net) } } - node = rb_first(&nn->unconf_name_tree); - while (node != NULL) { - tmp = node; - node = rb_next(tmp); - clp = rb_entry(tmp, struct nfs4_client, cl_namenode); - rb_erase(tmp, &nn->unconf_name_tree); - destroy_client(clp); + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + while (!list_empty(&nn->unconf_id_hashtbl[i])) { + clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); + destroy_client(clp); + } } kfree(nn->sessionid_hashtbl); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 63f2395c57ed..18881f34737a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -294,7 +294,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ32(nace); if (nace > NFS4_ACL_MAX) - return nfserr_resource; + return nfserr_fbig; *acl = nfs4_acl_new(nace); if (*acl == NULL) @@ -1222,7 +1222,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) } write->wr_head.iov_base = p; write->wr_head.iov_len = avail; - WARN_ON(avail != (XDR_QUADLEN(avail) << 2)); write->wr_pagelist = argp->pagelist; len = XDR_QUADLEN(write->wr_buflen) << 2; @@ -2483,6 +2482,8 @@ out_acl: goto out; } if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + if ((buflen -= 16) < 0) + goto out_resource; WRITE32(3); WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); @@ -2499,8 +2500,10 @@ out: security_release_secctx(context, contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ kfree(acl); - if (tempfh) + if (tempfh) { fh_put(tempfh); + kfree(tempfh); + } return status; out_nfserr: status = nfserrno(err); @@ -3471,6 +3474,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; + if (nfserr) + return nfserr; + RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); *p++ = htonl(test_stateid->ts_num_ids); @@ -3579,8 +3585,6 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad) return 0; session = resp->cstate.session; - if (session == NULL) - return 0; if (xb->page_len == 0) { length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; @@ -3620,7 +3624,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); - /* nfsd4_check_drc_limit guarantees enough room for error status */ + /* nfsd4_check_resp_size guarantees enough room for error status */ if (!op->status) op->status = nfsd4_check_resp_size(resp, 0); if (so) { @@ -3691,6 +3695,12 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp) int nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) { + if (rqstp->rq_arg.head[0].iov_len % 4) { + /* client is nuts */ + dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)", + __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); + return 0; + } args->p = p; args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; args->pagelist = rqstp->rq_arg.pages; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7f555179bf81..f34d9de802ab 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -699,6 +699,11 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net) if (err != 0 || fd < 0) return -EINVAL; + if (svc_alien_sock(net, fd)) { + printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__); + return -EINVAL; + } + err = nfsd_create_serv(net); if (err != 0) return err; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 30f34ab02137..479eb681c27c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -282,7 +282,7 @@ void nfsd_lockd_shutdown(void); * reason. */ #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ -#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ +#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 4775bc4896c8..ad67964d0bb1 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -133,6 +133,17 @@ fh_init(struct svc_fh *fhp, int maxsize) #ifdef CONFIG_NFSD_V3 /* + * The wcc data stored in current_fh should be cleared + * between compound ops. + */ +static inline void +fh_clear_wcc(struct svc_fh *fhp) +{ + fhp->fh_post_saved = 0; + fhp->fh_pre_saved = 0; +} + +/* * Fill in the pre_op attr for the wcc data */ static inline void @@ -152,7 +163,8 @@ fill_pre_wcc(struct svc_fh *fhp) extern void fill_post_wcc(struct svc_fh *); #else -#define fill_pre_wcc(ignored) +#define fh_clear_wcc(ignored) +#define fill_pre_wcc(ignored) #define fill_post_wcc(notused) #endif /* CONFIG_NFSD_V3 */ diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index b17d93214d01..9c769a47ac5a 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, type = (stat->mode & S_IFMT); *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) (stat->mode & S_IALLUGO)); + *p++ = htonl((u32) stat->mode); *p++ = htonl((u32) stat->nlink); *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6d7be3f80356..16f0673a423c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -404,6 +404,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, umode_t ftype = 0; __be32 err; int host_err; + bool get_write_count; int size_change = 0; if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) @@ -411,10 +412,18 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, if (iap->ia_valid & ATTR_SIZE) ftype = S_IFREG; + /* Callers that do fh_verify should do the fh_want_write: */ + get_write_count = !fhp->fh_dentry; + /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) goto out; + if (get_write_count) { + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + } dentry = fhp->fh_dentry; inode = dentry->d_inode; @@ -1694,7 +1703,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) @@ -1706,10 +1715,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, dput(odentry); out_nfserr: err = nfserrno(host_err); - - /* we cannot reply on fh_unlock on the two filehandles, + /* + * We cannot rely on fh_unlock on the two filehandles, * as that would do the wrong thing if the two directories - * were the same, so again we do it by hand + * were the same, so again we do it by hand. */ fill_post_wcc(ffhp); fill_post_wcc(tfhp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d278a0d03496..5ea7df305083 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -574,8 +574,6 @@ extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid_confirm *setclientid_confirm); extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); -extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, - struct nfsd4_sequence *seq); extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_exchange_id *); extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index deaa3d33a0aa..0d58075f34e2 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct inode *cpfile; int err; + if (cpsize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { + printk(KERN_ERR + "NILFS: too small checkpoint size: %zu bytes.\n", + cpsize); + return -EINVAL; + } + cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index fa0f80308c2d..0d5fada91191 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_dat_info *di; int err; + if (entry_size > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) { + printk(KERN_ERR + "NILFS: too small DAT entry size: %zu bytes.\n", + entry_size); + return -EINVAL; + } + dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); if (unlikely(!dat)) return -ENOMEM; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 08fdb77852ac..f3a82fbcae02 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7e350c562e0e..b9c5726120e3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -783,16 +783,14 @@ void nilfs_evict_inode(struct inode *inode) int ret; if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nilfs_clear_inode(inode); return; } nilfs_transaction_begin(sb, &ti, 0); /* never fails */ - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: some of the following operations may fail. */ nilfs_truncate_bmap(ii, 0); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 2b34021948e4..422fb54b7377 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1072,6 +1072,48 @@ out: } /** + * nilfs_ioctl_trim_fs() - trim ioctl handle function + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It + * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which + * performs the actual trim operation. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct request_queue *q = bdev_get_queue(nilfs->ns_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, argp, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity); + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range); + up_read(&nilfs->ns_segctor_sem); + + if (ret < 0) + return ret; + + if (copy_to_user(argp, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +/** * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated * @inode: inode object * @argp: pointer on argument from userspace @@ -1163,6 +1205,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_set_suinfo - set segment usage info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: Expects an array of nilfs_suinfo_update structures + * encapsulated in nilfs_argv and updates the segment usage info + * according to the flags in nilfs_suinfo_update. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EPERM - Not enough permissions + * + * %-EFAULT - Error copying input data + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + struct nilfs_transaction_info ti; + struct nilfs_argv argv; + size_t len; + void __user *base; + void *kbuf; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = -EFAULT; + if (copy_from_user(&argv, argp, sizeof(argv))) + goto out; + + ret = -EINVAL; + if (argv.v_size < sizeof(struct nilfs_suinfo_update)) + goto out; + + if (argv.v_nmembs > nilfs->ns_nsegments) + goto out; + + if (argv.v_nmembs >= UINT_MAX / argv.v_size) + goto out; + + len = argv.v_size * argv.v_nmembs; + if (!len) { + ret = 0; + goto out; + } + + base = (void __user *)(unsigned long)argv.v_base; + kbuf = vmalloc(len); + if (!kbuf) { + ret = -ENOMEM; + goto out; + } + + if (copy_from_user(kbuf, base, len)) { + ret = -EFAULT; + goto out_free; + } + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size, + argv.v_nmembs); + if (unlikely(ret < 0)) + nilfs_transaction_abort(inode->i_sb); + else + nilfs_transaction_commit(inode->i_sb); /* never fails */ + +out_free: + vfree(kbuf); +out: + mnt_drop_write_file(filp); + return ret; +} + long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1189,6 +1320,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_get_info(inode, filp, cmd, argp, sizeof(struct nilfs_suinfo), nilfs_ioctl_do_get_suinfo); + case NILFS_IOCTL_SET_SUINFO: + return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp); case NILFS_IOCTL_GET_SUSTAT: return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); case NILFS_IOCTL_GET_VINFO: @@ -1205,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_resize(inode, filp, argp); case NILFS_IOCTL_SET_ALLOC_RANGE: return nilfs_ioctl_set_alloc_range(inode, argp); + case FITRIM: + return nilfs_ioctl_trim_fs(inode, argp); default: return -ENOTTY; } @@ -1228,6 +1363,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_GET_CPINFO: case NILFS_IOCTL_GET_CPSTAT: case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_SET_SUINFO: case NILFS_IOCTL_GET_SUSTAT: case NILFS_IOCTL_GET_VINFO: case NILFS_IOCTL_GET_BDESCS: @@ -1235,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: + case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 3127e9f438a7..2a869c35c362 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -870,6 +870,289 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, } /** + * nilfs_sufile_set_suinfo - sets segment usage info + * @sufile: inode of segment usage file + * @buf: array of suinfo_update + * @supsz: byte size of suinfo_update + * @nsup: size of suinfo_update array + * + * Description: Takes an array of nilfs_suinfo_update structs and updates + * segment usage accordingly. Only the fields indicated by the sup_flags + * are updated. + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid values in input (segment number, flags or nblocks) + */ +ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, + unsigned supsz, size_t nsup) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh, *bh; + struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup; + struct nilfs_segment_usage *su; + void *kaddr; + unsigned long blkoff, prev_blkoff; + int cleansi, cleansu, dirtysi, dirtysu; + long ncleaned = 0, ndirtied = 0; + int ret = 0; + + if (unlikely(nsup == 0)) + return ret; + + for (sup = buf; sup < supend; sup = (void *)sup + supsz) { + if (sup->sup_segnum >= nilfs->ns_nsegments + || (sup->sup_flags & + (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS)) + || (nilfs_suinfo_update_nblocks(sup) && + sup->sup_sui.sui_nblocks > + nilfs->ns_blocks_per_segment)) + return -EINVAL; + } + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + sup = buf; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, sup->sup_segnum, bh, kaddr); + + if (nilfs_suinfo_update_lastmod(sup)) + su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod); + + if (nilfs_suinfo_update_nblocks(sup)) + su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks); + + if (nilfs_suinfo_update_flags(sup)) { + /* + * Active flag is a virtual flag projected by running + * nilfs kernel code - drop it not to write it to + * disk. + */ + sup->sup_sui.sui_flags &= + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + + cleansi = nilfs_suinfo_clean(&sup->sup_sui); + cleansu = nilfs_segment_usage_clean(su); + dirtysi = nilfs_suinfo_dirty(&sup->sup_sui); + dirtysu = nilfs_segment_usage_dirty(su); + + if (cleansi && !cleansu) + ++ncleaned; + else if (!cleansi && cleansu) + --ncleaned; + + if (dirtysi && !dirtysu) + ++ndirtied; + else if (!dirtysi && dirtysu) + --ndirtied; + + su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags); + } + + kunmap_atomic(kaddr); + + sup = (void *)sup + supsz; + if (sup >= supend) + break; + + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + mark_buffer_dirty(bh); + put_bh(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh); + if (unlikely(ret < 0)) + goto out_mark; + } + mark_buffer_dirty(bh); + put_bh(bh); + + out_mark: + if (ncleaned || ndirtied) { + nilfs_sufile_mod_counter(header_bh, (u64)ncleaned, + (u64)ndirtied); + NILFS_SUI(sufile)->ncleansegs += ncleaned; + } + nilfs_mdt_mark_dirty(sufile); + out_header: + put_bh(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_trim_fs() - trim ioctl handle function + * @sufile: inode of segment usage file + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * + * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes + * from start to start+len. start is rounded up to the next block boundary + * and start+len is rounded down. For each clean segment blkdev_issue_discard + * function is invoked. + * + * Return Value: On success, 0 is returned or negative error code, otherwise. + */ +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + void *kaddr; + size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; + sector_t seg_start, seg_end, start_block, end_block; + sector_t start = 0, nblocks = 0; + u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0; + int ret = 0; + unsigned int sects_per_block; + + sects_per_block = (1 << nilfs->ns_blocksize_bits) / + bdev_logical_block_size(nilfs->ns_bdev); + len = range->len >> nilfs->ns_blocksize_bits; + minlen = range->minlen >> nilfs->ns_blocksize_bits; + max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment); + + if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits) + return -EINVAL; + + start_block = (range->start + nilfs->ns_blocksize - 1) >> + nilfs->ns_blocksize_bits; + + /* + * range->len can be very large (actually, it is set to + * ULLONG_MAX by default) - truncate upper end of the range + * carefully so as not to overflow. + */ + if (max_blocks - start_block < len) + end_block = max_blocks - 1; + else + end_block = start_block + len - 1; + + segnum = nilfs_get_segnum_of_block(nilfs, start_block); + segnum_end = nilfs_get_segnum_of_block(nilfs, end_block); + + down_read(&NILFS_MDT(sufile)->mi_sem); + + while (segnum <= segnum_end) { + n = nilfs_sufile_segment_usages_in_block(sufile, segnum, + segnum_end); + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* hole */ + segnum += n; + continue; + } + + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, + su_bh, kaddr); + for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { + if (!nilfs_segment_usage_clean(su)) + continue; + + nilfs_get_segment_range(nilfs, segnum, &seg_start, + &seg_end); + + if (!nblocks) { + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + continue; + } + + if (start + nblocks == seg_start) { + /* add to previous extent */ + nblocks += seg_end - seg_start + 1; + continue; + } + + /* discard previous extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + + if (nblocks >= minlen) { + kunmap_atomic(kaddr); + + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (ret < 0) { + put_bh(su_bh); + goto out_sem; + } + + ndiscarded += nblocks; + kaddr = kmap_atomic(su_bh->b_page); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + } + + /* start new extent */ + start = seg_start; + nblocks = seg_end - seg_start + 1; + } + kunmap_atomic(kaddr); + put_bh(su_bh); + } + + + if (nblocks) { + /* discard last extent */ + if (start < start_block) { + nblocks -= start_block - start; + start = start_block; + } + if (start + nblocks > end_block + 1) + nblocks = end_block - start + 1; + + if (nblocks >= minlen) { + ret = blkdev_issue_discard(nilfs->ns_bdev, + start * sects_per_block, + nblocks * sects_per_block, + GFP_NOFS, 0); + if (!ret) + ndiscarded += nblocks; + } + } + +out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + + range->len = ndiscarded << nilfs->ns_blocksize_bits; + return ret; +} + +/** * nilfs_sufile_read - read or get sufile inode * @sb: super block instance * @susize: size of a segment usage entry @@ -886,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, void *kaddr; int err; + if (susize > sb->s_blocksize) { + printk(KERN_ERR + "NILFS: too large segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { + printk(KERN_ERR + "NILFS: too small segment usage size: %zu bytes.\n", + susize); + return -EINVAL; + } + sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index e84bc5b51fc1..b8afd72f2379 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, size_t); +ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t); int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, void (*dofunc)(struct inode *, __u64, @@ -65,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep); +int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range); /** * nilfs_sufile_scrap - make a segment garbage diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7ac2a122ca1d..8c532b2ca3ab 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_mount_opt; int err; + sync_filesystem(sb); old_sb_flags = sb->s_flags; old_mount_opt = nilfs->ns_mount_opt; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 94c451ce6d24..8ba8229ba076 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, return -EINVAL; nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + if (nilfs->ns_inode_size > nilfs->ns_blocksize) { + printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { + printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n", + nilfs->ns_inode_size); + return -EINVAL; + } + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index dc638f786d5c..ee9cb3795c2b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -60,8 +60,8 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fanotify_event_info *event) +static int fanotify_get_response(struct fsnotify_group *group, + struct fanotify_perm_event_info *event) { int ret; @@ -142,6 +142,40 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, return false; } +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path) +{ + struct fanotify_event_info *event; + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + struct fanotify_perm_event_info *pevent; + + pevent = kmem_cache_alloc(fanotify_perm_event_cachep, + GFP_KERNEL); + if (!pevent) + return NULL; + event = &pevent->fae; + pevent->response = 0; + goto init; + } +#endif + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; +init: __maybe_unused + fsnotify_init_event(&event->fse, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (path) { + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } + return event; +} + static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, @@ -171,25 +205,11 @@ static int fanotify_handle_event(struct fsnotify_group *group, pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, mask); - event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + event = fanotify_alloc_event(inode, mask, data); if (unlikely(!event)) return -ENOMEM; fsn_event = &event->fse; - fsnotify_init_event(fsn_event, inode, mask); - event->tgid = get_pid(task_tgid(current)); - if (data_type == FSNOTIFY_EVENT_PATH) { - struct path *path = data; - event->path = *path; - path_get(&event->path); - } else { - event->path.mnt = NULL; - event->path.dentry = NULL; - } -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - event->response = 0; -#endif - ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ @@ -202,7 +222,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (mask & FAN_ALL_PERM_EVENTS) { - ret = fanotify_get_response_from_access(group, event); + ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event)); fsnotify_destroy_event(group, fsn_event); } #endif @@ -225,6 +245,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) event = FANOTIFY_E(fsn_event); path_put(&event->path); put_pid(event->tgid); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (fsn_event->mask & FAN_ALL_PERM_EVENTS) { + kmem_cache_free(fanotify_perm_event_cachep, + FANOTIFY_PE(fsn_event)); + return; + } +#endif kmem_cache_free(fanotify_event_cachep, event); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 32a2f034fb94..2a5fb14115df 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -3,13 +3,12 @@ #include <linux/slab.h> extern struct kmem_cache *fanotify_event_cachep; +extern struct kmem_cache *fanotify_perm_event_cachep; /* - * Lifetime of the structure differs for normal and permission events. In both - * cases the structure is allocated in fanotify_handle_event(). For normal - * events the structure is freed immediately after reporting it to userspace. - * For permission events we free it only after we receive response from - * userspace. + * Structure for normal fanotify events. It gets allocated in + * fanotify_handle_event() and freed when the information is retrieved by + * userspace */ struct fanotify_event_info { struct fsnotify_event fse; @@ -19,12 +18,33 @@ struct fanotify_event_info { */ struct path path; struct pid *tgid; +}; + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - u32 response; /* userspace answer to question */ -#endif +/* + * Structure for permission fanotify events. It gets allocated and freed in + * fanotify_handle_event() since we wait there for user response. When the + * information is retrieved by userspace the structure is moved from + * group->notification_list to group->fanotify_data.access_list to wait for + * user response. + */ +struct fanotify_perm_event_info { + struct fanotify_event_info fae; + int response; /* userspace answer to question */ + int fd; /* fd we passed to userspace for this event */ }; +static inline struct fanotify_perm_event_info * +FANOTIFY_PE(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_perm_event_info, fae.fse); +} +#endif + static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_event_info, fse); } + +struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, + struct path *path); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 287a22c04149..732648b270dc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -28,14 +28,8 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; -static struct kmem_cache *fanotify_response_event_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; - -struct fanotify_response_event { - struct list_head list; - __s32 fd; - struct fanotify_event_info *event; -}; +struct kmem_cache *fanotify_perm_event_cachep __read_mostly; /* * Get an fsnotify notification event if one exists and is small @@ -135,33 +129,34 @@ static int fill_event_metadata(struct fsnotify_group *group, } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, - __s32 fd) +static struct fanotify_perm_event_info *dequeue_event( + struct fsnotify_group *group, int fd) { - struct fanotify_response_event *re, *return_re = NULL; + struct fanotify_perm_event_info *event, *return_e = NULL; - mutex_lock(&group->fanotify_data.access_mutex); - list_for_each_entry(re, &group->fanotify_data.access_list, list) { - if (re->fd != fd) + spin_lock(&group->fanotify_data.access_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (event->fd != fd) continue; - list_del_init(&re->list); - return_re = re; + list_del_init(&event->fae.fse.list); + return_e = event; break; } - mutex_unlock(&group->fanotify_data.access_mutex); + spin_unlock(&group->fanotify_data.access_lock); - pr_debug("%s: found return_re=%p\n", __func__, return_re); + pr_debug("%s: found return_re=%p\n", __func__, return_e); - return return_re; + return return_e; } static int process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct) { - struct fanotify_response_event *re; - __s32 fd = response_struct->fd; - __u32 response = response_struct->response; + struct fanotify_perm_event_info *event; + int fd = response_struct->fd; + int response = response_struct->response; pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, fd, response); @@ -181,58 +176,15 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; - re = dequeue_re(group, fd); - if (!re) + event = dequeue_event(group, fd); + if (!event) return -ENOENT; - re->event->response = response; - + event->response = response; wake_up(&group->fanotify_data.access_waitq); - kmem_cache_free(fanotify_response_event_cache, re); - - return 0; -} - -static int prepare_for_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ - struct fanotify_response_event *re; - - if (!(event->mask & FAN_ALL_PERM_EVENTS)) - return 0; - - re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL); - if (!re) - return -ENOMEM; - - re->event = FANOTIFY_E(event); - re->fd = fd; - - mutex_lock(&group->fanotify_data.access_mutex); - - if (atomic_read(&group->fanotify_data.bypass_perm)) { - mutex_unlock(&group->fanotify_data.access_mutex); - kmem_cache_free(fanotify_response_event_cache, re); - FANOTIFY_E(event)->response = FAN_ALLOW; - return 0; - } - - list_add_tail(&re->list, &group->fanotify_data.access_list); - mutex_unlock(&group->fanotify_data.access_mutex); - - return 0; -} - -#else -static int prepare_for_access_response(struct fsnotify_group *group, - struct fsnotify_event *event, - __s32 fd) -{ return 0; } - #endif static ssize_t copy_event_to_user(struct fsnotify_group *group, @@ -247,7 +199,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); if (ret < 0) - goto out; + return ret; fd = fanotify_event_metadata.fd; ret = -EFAULT; @@ -255,9 +207,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fanotify_event_metadata.event_len)) goto out_close_fd; - ret = prepare_for_access_response(group, event, fd); - if (ret) - goto out_close_fd; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) + FANOTIFY_PE(event)->fd = fd; +#endif if (fd != FAN_NOFD) fd_install(fd, f); @@ -268,13 +221,6 @@ out_close_fd: put_unused_fd(fd); fput(f); } -out: -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - FANOTIFY_E(event)->response = FAN_DENY; - wake_up(&group->fanotify_data.access_waitq); - } -#endif return ret; } @@ -314,35 +260,50 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, kevent = get_one_event(group, count); mutex_unlock(&group->notification_mutex); - if (kevent) { + if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); - if (IS_ERR(kevent)) + break; + } + + if (!kevent) { + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) break; - ret = copy_event_to_user(group, kevent, buf); - /* - * Permission events get destroyed after we - * receive response - */ - if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) - fsnotify_destroy_event(group, kevent); - if (ret < 0) + + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + if (start != buf) break; - buf += ret; - count -= ret; + schedule(); continue; } - ret = -EAGAIN; - if (file->f_flags & O_NONBLOCK) - break; - ret = -ERESTARTSYS; - if (signal_pending(current)) - break; - - if (start != buf) - break; - - schedule(); + ret = copy_event_to_user(group, kevent, buf); + /* + * Permission events get queued to wait for response. Other + * events can be destroyed now. + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { + fsnotify_destroy_event(group, kevent); + if (ret < 0) + break; + } else { +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (ret < 0) { + FANOTIFY_PE(kevent)->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + break; + } + spin_lock(&group->fanotify_data.access_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->fanotify_data.access_lock); +#endif + } + buf += ret; + count -= ret; } finish_wait(&group->notification_waitq, &wait); @@ -383,22 +344,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) struct fsnotify_group *group = file->private_data; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - struct fanotify_response_event *re, *lre; + struct fanotify_perm_event_info *event, *next; - mutex_lock(&group->fanotify_data.access_mutex); + spin_lock(&group->fanotify_data.access_lock); atomic_inc(&group->fanotify_data.bypass_perm); - list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { - pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, - re, re->event); + list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, + fae.fse.list) { + pr_debug("%s: found group=%p event=%p\n", __func__, group, + event); - list_del_init(&re->list); - re->event->response = FAN_ALLOW; - - kmem_cache_free(fanotify_response_event_cache, re); + list_del_init(&event->fae.fse.list); + event->response = FAN_ALLOW; } - mutex_unlock(&group->fanotify_data.access_mutex); + spin_unlock(&group->fanotify_data.access_lock); wake_up(&group->fanotify_data.access_waitq); #endif @@ -731,21 +691,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); - oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); - oevent->tgid = get_pid(task_tgid(current)); - oevent->path.mnt = NULL; - oevent->path.dentry = NULL; + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - oevent->response = 0; - mutex_init(&group->fanotify_data.access_mutex); + spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); atomic_set(&group->fanotify_data.bypass_perm, 0); @@ -920,9 +877,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, static int __init fanotify_user_setup(void) { fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); - fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, - SLAB_PANIC); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info, + SLAB_PANIC); +#endif return 0; } diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index 807150e2c2b9..dd6103cc93c1 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -18,16 +18,9 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "debug.h" -/* - * A static buffer to hold the error string being displayed and a spinlock - * to protect concurrent accesses to it. - */ -static char err_buf[1024]; -static DEFINE_SPINLOCK(err_buf_lock); - /** * __ntfs_warning - output a warning to the syslog * @function: name of function outputting the warning @@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock); void __ntfs_warning(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_warn("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_warn("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); } /** @@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb, void __ntfs_error(const char *function, const struct super_block *sb, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb, #endif if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; if (sb) - printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", - sb->s_id, flen ? function : "", err_buf); + pr_err("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); else - printk(KERN_ERR "NTFS-fs error: %s(): %s\n", - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); + pr_err("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); } #ifdef DEBUG @@ -124,6 +115,7 @@ int debug_msgs = 0; void __ntfs_debug (const char *file, int line, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; int flen = 0; @@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function, return; if (function) flen = strlen(function); - spin_lock(&err_buf_lock); va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); va_end(args); - printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line, - flen ? function : "", err_buf); - spin_unlock(&err_buf_lock); } /* Dump a runlist. Caller has to provide synchronisation for @rl. */ @@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (!debug_msgs) return; - printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); + pr_debug("Dumping runlist (values in hex):\n"); if (!rl) { - printk(KERN_DEBUG "Run list not present.\n"); + pr_debug("Run list not present.\n"); return; } - printk(KERN_DEBUG "VCN LCN Run length\n"); + pr_debug("VCN LCN Run length\n"); for (i = 0; ; i++) { LCN lcn = (rl + i)->lcn; @@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (index > -LCN_ENOENT - 1) index = 3; - printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", + pr_debug("%-16Lx %s %-16Lx%s\n", (long long)(rl + i)->vcn, lcn_str[index], (long long)(rl + i)->length, (rl + i)->length ? "" : " (runlist end)"); } else - printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", + pr_debug("%-16Lx %-16Lx %-16Lx%s\n", (long long)(rl + i)->vcn, (long long)(rl + i)->lcn, (long long)(rl + i)->length, diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 53c27eaf2307..61bf091e32a8 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); #else /* !DEBUG */ -#define ntfs_debug(f, a...) do {} while (0) +#define ntfs_debug(fmt, ...) \ +do { \ + if (0) \ + no_printk(fmt, ##__VA_ARGS__); \ +} while (0) + #define ntfs_debug_dump_runlist(rl) do {} while (0) #endif /* !DEBUG */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index ffb9b3675736..f47af5e6e230 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) iput(bvi); skip_large_index_stuff: /* Setup the operations for this index inode. */ - vi->i_op = NULL; - vi->i_fop = NULL; vi->i_mapping->a_ops = &ntfs_mst_aops; vi->i_blocks = ni->allocated_size >> 9; /* @@ -2259,7 +2257,7 @@ void ntfs_evict_big_inode(struct inode *vi) { ntfs_inode *ni = NTFS_I(vi); - truncate_inode_pages(&vi->i_data, 0); + truncate_inode_pages_final(&vi->i_data); clear_inode(vi); #ifdef NTFS_RW diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 82650d52d916..9de2491f2926 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -19,6 +19,7 @@ * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/stddef.h> #include <linux/init.h> @@ -468,6 +469,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_debug("Entering with remount options string: %s", opt); + sync_filesystem(sb); + #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ *flags |= MS_RDONLY; @@ -1894,7 +1897,7 @@ get_ctx_vol_failed: vol->minor_ver = vi->minor_ver; ntfs_attr_put_search_ctx(ctx); unmap_mft_record(NTFS_I(vol->vol_ino)); - printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, + pr_info("volume version %i.%i.\n", vol->major_ver, vol->minor_ver); if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " @@ -3093,7 +3096,7 @@ static int __init init_ntfs_fs(void) int err = 0; /* This may be ugly but it results in pretty output so who cares. (-8 */ - printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" + pr_info("driver " NTFS_VERSION " [Flags: R/" #ifdef NTFS_RW "W" #else @@ -3113,16 +3116,15 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_index_context), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_index_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_index_ctx_cache_name); + pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); goto ictx_err_out; } ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, sizeof(ntfs_attr_search_ctx), 0 /* offset */, SLAB_HWCACHE_ALIGN, NULL /* ctor */); if (!ntfs_attr_ctx_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_attr_ctx_cache_name); + pr_crit("NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); goto actx_err_out; } @@ -3130,8 +3132,7 @@ static int __init init_ntfs_fs(void) (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, SLAB_HWCACHE_ALIGN, NULL); if (!ntfs_name_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_name_cache_name); + pr_crit("Failed to create %s!\n", ntfs_name_cache_name); goto name_err_out; } @@ -3139,8 +3140,7 @@ static int __init init_ntfs_fs(void) sizeof(ntfs_inode), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!ntfs_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); goto inode_err_out; } @@ -3149,15 +3149,14 @@ static int __init init_ntfs_fs(void) SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { - printk(KERN_CRIT "NTFS: Failed to create %s!\n", - ntfs_big_inode_cache_name); + pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; } /* Register the ntfs sysctls. */ err = ntfs_sysctl(1); if (err) { - printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); + pr_crit("Failed to register NTFS sysctls!\n"); goto sysctl_err_out; } @@ -3166,7 +3165,7 @@ static int __init init_ntfs_fs(void) ntfs_debug("NTFS driver registered successfully."); return 0; /* Success! */ } - printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); + pr_crit("Failed to register NTFS filesystem driver!\n"); /* Unregister the ntfs sysctls. */ ntfs_sysctl(0); @@ -3182,8 +3181,7 @@ actx_err_out: kmem_cache_destroy(ntfs_index_ctx_cache); ictx_err_out: if (!err) { - printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " - "registration...\n"); + pr_crit("Aborting NTFS filesystem driver registration...\n"); err = -ENOMEM; } return err; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 555f4cddefe3..7e8282dcea2a 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -205,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, di->i_mode = cpu_to_le16(inode->i_mode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e2edff38be52..b4deb5f750d9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5728,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode, } ocfs2_et_update_clusters(et, -len); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, et->et_root_bh); @@ -6932,6 +6933,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_dinode_new_extent_list(inode, di); ocfs2_journal_dirty(handle, di_bh); @@ -7208,6 +7210,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); out_commit: diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index aeb44e879c51..d310d12a9adc 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -571,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -582,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); - if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && - waitqueue_active(wq)) { - wake_up_all(wq); - } + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } ocfs2_iocb_clear_rw_locked(iocb); @@ -2043,6 +2039,7 @@ out_write_size: inode->i_mtime = inode->i_ctime = CURRENT_TIME; di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f671e49beb34..6cae155d54df 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits { #define ocfs2_iocb_is_unaligned_aio(iocb) \ test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) -#define OCFS2_IOEND_WQ_HASH_SZ 37 -#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ - OCFS2_IOEND_WQ_HASH_SZ]) -extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 5b704c63a103..1edcb141f639 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, * information for this bh as it's not marked locally * uptodate. */ ret = -EIO; - put_bh(bh); mlog_errno(ret); } @@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, if (!buffer_uptodate(bh)) { ret = -EIO; - put_bh(bh); mlog_errno(ret); } diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index a4b07730b2e1..b7f57271d49c 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } static struct kobj_attribute attr_version = - __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + __ATTR(interface_revision, S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { &attr_version.attr, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2cd2406b4140..c6b90e670389 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] = static void o2net_sc_connect_completed(struct work_struct *work); static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); @@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc) #endif /* CONFIG_OCFS2_FS_STATS */ -static inline int o2net_reconnect_delay(void) +static inline unsigned int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; } -static inline int o2net_keepalive_delay(void) +static inline unsigned int o2net_keepalive_delay(void) { return o2nm_single_cluster->cl_keepalive_delay_ms; } -static inline int o2net_idle_timeout(void) +static inline unsigned int o2net_idle_timeout(void) { return o2nm_single_cluster->cl_idle_timeout_ms; } @@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); if (sk->sk_user_data) { @@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes) } read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } /* see o2net_register_callbacks() */ @@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } @@ -1953,9 +1926,9 @@ static void o2net_accept_many(struct work_struct *work) cond_resched(); } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; @@ -1964,18 +1937,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes) goto out; } - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. we only want to queue listen work for our listening - * socket */ + /* This callback may called twice when a new connection + * is being established as a child socket inherits everything + * from a parent LISTEN socket, including the data_ready cb of + * the parent. This leads to a hazard. In o2net_accept_one() + * we are still initializing the child socket but have not + * changed the inherited data_ready callback yet when + * data starts arriving. + * We avoid this hazard by checking the state. + * For the listening socket, the state will be TCP_LISTEN; for the new + * socket, will be TCP_ESTABLISHED. Also, in this case, + * sk->sk_user_data is not a valid function pointer. + */ + if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); + } else { + ready = NULL; } out: read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + if (ready != NULL) + ready(sk); } static int o2net_open_listening_sock(__be32 addr, __be16 port) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a3..dc024367110a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -165,7 +165,7 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); + void (*sc_data_ready)(struct sock *sk); u32 sc_msg_key; u16 sc_msg_type; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 0d3a97d2d5f6..e2e05a106beb 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -37,7 +37,6 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" -#include "super.h" #include "ocfs2_trace.h" void ocfs2_dentry_attach_gen(struct dentry *dentry) @@ -346,52 +345,6 @@ out_attach: return ret; } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. */ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ - struct ocfs2_dentry_lock *dl; - - spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { - dl = osb->dentry_lock_list; - osb->dentry_lock_list = dl->dl_next; - spin_unlock(&dentry_list_lock); - iput(dl->dl_inode); - kfree(dl); - spin_lock(&dentry_list_lock); - } - spin_unlock(&dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); - - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); - /* - * Don't queue dropping if umount is in progress. We flush the - * list in ocfs2_dismount_volume - */ - spin_lock(&dentry_list_lock); - if (osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - spin_unlock(&dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ - __ocfs2_drop_dl_inodes(osb, -1); -} - /* * ocfs2_dentry_iput() and friends. * @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - - /* We leave dropping of inode reference to ocfs2_wq as that can - * possibly lead to inode deletion which gets tricky */ - spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - dl->dl_next = osb->dentry_lock_list; - osb->dentry_lock_list = dl; - spin_unlock(&dentry_list_lock); + kfree(dl); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock; + int unlock = 0; BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index b79eff709958..55f58892b153 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,13 +29,8 @@ extern const struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { - /* Use count of dentry lock */ unsigned int dl_count; - union { - /* Linked list of dentry locks to release */ - struct ocfs2_dentry_lock *dl_next; - u64 dl_parent_blkno; - }; + u64 dl_parent_blkno; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); -extern spinlock_t dentry_list_lock; - void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); -void ocfs2_drop_dl_inodes(struct work_struct *work); -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); - struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 91a7e85ac8fd..0717662b4aef 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_init_dir_trailer(dir, dirdata_bh, i); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dirdata_bh); if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { @@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, dir, 1); /* * This should never fail as our extent list is empty and all @@ -3338,6 +3340,7 @@ do_extend: } else { de->rec_len = cpu_to_le16(sb->s_blocksize); } + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, new_bh); dir_i_size += dir->i_sb->s_blocksize; @@ -3896,6 +3899,7 @@ out_commit: dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(dir->i_sb, 1)); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_commit_trans(osb, handle); out: @@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, mlog_errno(ret); did_quota = 0; + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, dx_root_bh); out_commit: @@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); + ocfs2_update_inode_fsync_trans(handle, dir, 1); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 33660a4a52fa..c973690dc0bc 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1123,7 +1123,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, struct dlm_ctxt *dlm = NULL; char *local = NULL; int status = 0; - int locked = 0; qr = (struct dlm_query_region *) msg->buf; @@ -1132,10 +1131,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, /* buffer used in dlm_mast_regions() */ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); - if (!local) { - status = -ENOMEM; - goto bail; - } + if (!local) + return -ENOMEM; status = -EINVAL; @@ -1144,16 +1141,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, if (!dlm) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "before join domain\n", qr->qr_node, qr->qr_domain); - goto bail; + goto out_domain_lock; } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qr->qr_node) { mlog(ML_ERROR, "Node %d queried hb regions on domain %s " "but joining node is %d\n", qr->qr_node, qr->qr_domain, dlm->joining_node); - goto bail; + goto out_dlm_lock; } /* Support for global heartbeat was added in 1.1 */ @@ -1163,14 +1159,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qr->qr_node, qr->qr_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto out_dlm_lock; } status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); -bail: - if (locked) - spin_unlock(&dlm->spinlock); +out_dlm_lock: + spin_unlock(&dlm->spinlock); + +out_domain_lock: spin_unlock(&dlm_domain_lock); kfree(local); @@ -1877,19 +1874,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - status = dlm_debug_init(dlm); + status = dlm_launch_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_thread(dlm); + status = dlm_launch_recovery_thread(dlm); if (status < 0) { mlog_errno(status); goto bail; } - status = dlm_launch_recovery_thread(dlm); + status = dlm_debug_init(dlm); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index af3f7aa73e13..ee1f88419cb0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -472,11 +472,15 @@ bail: void dlm_destroy_master_caches(void) { - if (dlm_lockname_cache) + if (dlm_lockname_cache) { kmem_cache_destroy(dlm_lockname_cache); + dlm_lockname_cache = NULL; + } - if (dlm_lockres_cache) + if (dlm_lockres_cache) { kmem_cache_destroy(dlm_lockres_cache); + dlm_lockres_cache = NULL; + } } static void dlm_lockres_release(struct kref *kref) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7035af09cc03..fe29f7978f81 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -537,7 +537,10 @@ master_here: /* success! see if any other nodes need recovery */ mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", dlm->name, dlm->reco.dead_node, dlm->node_num); - dlm_reset_recovery(dlm); + spin_lock(&dlm->spinlock); + __dlm_reset_recovery(dlm); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); } dlm_end_recovery(dlm); @@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) if (all_nodes_done) { int ret; + /* Set this flag on recovery master to avoid + * a new recovery for another dead node start + * before the recovery is not done. That may + * cause recovery hung.*/ + spin_lock(&dlm->spinlock); + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state * just send a finalize message to everyone and * clean up */ @@ -1750,13 +1761,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue; + struct list_head *queue, *iter; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; int i, j, bad; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock; u8 from = O2NM_MAX_NODES; unsigned int added = 0; __be64 c; @@ -1791,14 +1802,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* MIGRATION ONLY! */ BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + lock = NULL; spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each_entry(lock, tmpq, list) { - if (lock->ml.cookie != ml->cookie) - lock = NULL; - else + list_for_each(iter, tmpq) { + lock = list_entry(iter, + struct dlm_lock, list); + if (lock->ml.cookie == ml->cookie) break; + lock = NULL; } if (lock) break; @@ -2882,8 +2895,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); - dlm_reset_recovery(dlm); dlm_kick_recovery_thread(dlm); break; default: diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 19986959d149..6bd690b5a061 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3144,22 +3144,60 @@ out: return 0; } +static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + /* Mark the lockres as being dropped. It will no longer be * queued if blocking, but we still may have to wait on it * being dequeued from the downconvert thread before we can consider * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int status; struct ocfs2_mask_waiter mw; - unsigned long flags; + unsigned long flags, flags2; ocfs2_init_mask_waiter(&mw); spin_lock_irqsave(&lockres->l_lock, flags); lockres->l_flags |= OCFS2_LOCK_FREEING; + if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) { + /* + * We know the downconvert is queued but not in progress + * because we are the downconvert thread and processing + * different lock. So we can just remove the lock from the + * queue. This is not only an optimization but also a way + * to avoid the following deadlock: + * ocfs2_dentry_post_unlock() + * ocfs2_dentry_lock_put() + * ocfs2_drop_dentry_lock() + * iput() + * ocfs2_evict_inode() + * ocfs2_clear_inode() + * ocfs2_mark_lockres_freeing() + * ... blocks waiting for OCFS2_LOCK_QUEUED + * since we are the downconvert thread which + * should clear the flag. + */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + spin_lock_irqsave(&osb->dc_task_lock, flags2); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock_irqrestore(&osb->dc_task_lock, flags2); + /* + * Warn if we recurse into another post_unlock call. Strictly + * speaking it isn't a problem but we need to be careful if + * that happens (stack overflow, deadlocks, ...) so warn if + * ocfs2 grows a path for which this can happen. + */ + WARN_ON_ONCE(lockres->l_ops->post_unlock); + /* Since the lock is freeing we don't do much in the fn below */ + ocfs2_process_blocked_lock(osb, lockres); + return; + } while (lockres->l_flags & OCFS2_LOCK_QUEUED) { lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -3180,7 +3218,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, { int ret; - ocfs2_mark_lockres_freeing(lockres); + ocfs2_mark_lockres_freeing(osb, lockres); ret = ocfs2_drop_lock(osb, lockres); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8c4a4a..d293a22c32c5 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 51632c40e896..8970dcf74de5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { int err = 0; - journal_t *journal; struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; trace_ocfs2_sync_file(inode, file, file->f_path.dentry, OCFS2_I(inode)->ip_blkno, @@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, if (err) return err; - /* - * Probably don't need the i_mutex at all in here, just putting it here - * to be consistent with how fsync used to be called, someone more - * familiar with the fs could possibly remove it. - */ - mutex_lock(&inode->i_mutex); - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { - /* - * We still have to flush drive's caches to get data to the - * platter - */ - if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - goto bail; + commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + err = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!err) + err = ret; } - journal = osb->journal->j_journal; - err = jbd2_journal_force_commit(journal); - -bail: if (err) mlog_errno(err); - mutex_unlock(&inode->i_mutex); return (err < 0) ? -EIO : 0; } @@ -292,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode, inode->i_atime = CURRENT_TIME; di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); out_commit: @@ -341,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode, if (ret < 0) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_commit_trans(osb, handle); out: return ret; @@ -435,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, di->i_size = cpu_to_le64(new_i_size); di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -650,7 +647,7 @@ restarted_transaction: mlog_errno(status); goto leave; } - + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(inode)->ip_lock); @@ -743,6 +740,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 1); out: if (ret) { @@ -840,6 +838,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); di->i_mtime_nsec = di->i_ctime_nsec; ocfs2_journal_dirty(handle, di_bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); } @@ -1344,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, di = (struct ocfs2_dinode *) bh->b_data; di->i_mode = cpu_to_le16(inode->i_mode); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -1576,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, if (ret) mlog_errno(ret); } + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_commit_trans(osb, handle); out: @@ -2061,13 +2062,6 @@ out: return ret; } -static void ocfs2_aiodio_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ocfs2_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); -} - static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) { int blockmask = inode->i_sb->s_blocksize - 1; @@ -2345,10 +2339,8 @@ relock: * Wait on previous unaligned aio to complete before * proceeding. */ - ocfs2_aiodio_wait(inode); - - /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ - atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio); + /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */ ocfs2_iocb_set_unaligned_aio(iocb); } @@ -2375,15 +2367,18 @@ relock: if (direct_io) { written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - ppos, count, ocount); + count, ocount); if (written < 0) { ret = written; goto out_dio; } } else { + struct iov_iter from; + iov_iter_init(&from, iov, nr_segs, count, 0); current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - ppos, count, 0); + written = generic_perform_write(file, &from, *ppos); + if (likely(written >= 0)) + iocb->ki_pos = *ppos + written; current->backing_dev_info = NULL; } @@ -2428,7 +2423,7 @@ out_dio: if (unaligned_dio) { ocfs2_iocb_clear_unaligned_aio(iocb); - atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } out: @@ -2645,7 +2640,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_SET: break; case SEEK_END: - offset += inode->i_size; + /* SEEK_END requires the OCFS2 inode lock for the file + * because it references the file's size. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + offset += i_size_read(inode); + ocfs2_inode_unlock(inode, 0); break; case SEEK_CUR: if (offset == 0) { diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f29a90fde619..437de7f768c6 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, goto bail; } + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + oi->i_sync_tid = tid; + oi->i_datasync_tid = tid; + } + bail: if (!IS_ERR(inode)) { trace_ocfs2_iget_end(inode, @@ -804,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail; } - /* If we're coming from downconvert_thread we can't go into our own - * voting [hello, deadlock city!], so unforuntately we just - * have to skip deleting this guy. That's OK though because - * the node who's doing the actual deleting should handle it - * anyway. */ + /* + * If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!] so we cannot delete the inode. But + * since we dropped last inode ref when downconverting dentry lock, + * we cannot have the file open and thus the node doing unlink will + * take care of deleting the inode. + */ if (current == osb->dc_task) goto bail; @@ -822,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have allowd wipe of this inode for another node, it - * will be marked here so we can safely skip it. Recovery will - * cleanup any inodes we might inadvertently skip here. */ - if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) - goto bail_unlock; - ret = 1; bail_unlock: spin_unlock(&oi->ip_lock); @@ -941,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) filemap_write_and_wait(inode->i_mapping); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } static void ocfs2_delete_inode(struct inode *inode) @@ -960,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode) if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) goto bail; - dquot_initialize(inode); - if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -970,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + /* We want to block signals in delete_inode as the lock and * messaging paths may return us -ERESTARTSYS. Which would * cause us to exit early, resulting in inodes being orphaned @@ -1057,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, @@ -1073,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode) /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. */ - ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); - ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); + ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, &oi->ip_la_data_resv); @@ -1157,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode) (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); } else { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); } ocfs2_clear_inode(inode); } @@ -1260,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ocfs2_journal_dirty(handle, bh); + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: return status; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73bf23d..a6c991c0fc98 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -44,7 +44,7 @@ struct ocfs2_inode_info struct rw_semaphore ip_xattr_sem; /* Number of outstanding AIO's which are not page aligned */ - atomic_t ip_unaligned_aio; + struct mutex ip_unaligned_aio; /* These fields are protected by ip_lock */ spinlock_t ip_lock; @@ -73,6 +73,13 @@ struct ocfs2_inode_info u32 ip_dir_lock_gen; struct ocfs2_alloc_reservation ip_la_data_resv; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. + */ + tid_t i_sync_tid; + tid_t i_datasync_tid; }; /* @@ -84,8 +91,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_BITMAP 0x00000004 /* This inode has been wiped from disk */ #define OCFS2_INODE_DELETED 0x00000008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE 0x00000010 /* Has the inode been orphaned on another node? * * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -100,11 +105,11 @@ struct ocfs2_inode_info * rely on ocfs2_delete_inode to sort things out under the proper * cluster locks. */ -#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010 /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +#define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 8ca3c29accbf..490229f43731 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -413,11 +413,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode, } status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i); - if (status < 0) - goto bail; iput(inode_alloc); inode_alloc = NULL; + + if (status < 0) + goto bail; } o2info_set_request_filled(&oifi->ifi_req); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e530c3d..03ea9314fecd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; spin_lock(&oi->ip_lock); - /* The remote delete code may have set these on the - * assumption that the other node would wipe them - * successfully. If they are still in the node's - * orphan dir, we need to reset that state. */ - oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); - /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 9ff4e8cf9d97..7f8cde94abfe 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode, new_size); } +static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index e57c804069ea..6b6d092b0998 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, } ret = flock_lock_file_wait(file, fl); + if (ret) + ocfs2_file_unlock(file); out: mutex_unlock(&fp->fp_mutex); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 64c304d668f0..599eb4c4c8be 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle, old_blkno, len); } + ocfs2_update_inode_fsync_trans(handle, inode, 0); out: ocfs2_free_path(path); return ret; @@ -690,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, goal_bit, len); - if (ret) + if (ret) { + ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } /* * Here we should write the new page out first if we are @@ -957,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) inode->i_ctime = CURRENT_TIME; di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3683643f3f0e..2060fc398445 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -450,7 +450,6 @@ leave: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); @@ -495,6 +494,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; u16 feat; + struct ocfs2_inode_info *oi = OCFS2_I(inode); *new_fe_bh = NULL; @@ -576,8 +576,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } - status = 0; /* error in ocfs2_create_new_inode_locks is not - * critical */ + oi->i_sync_tid = handle->h_transaction->t_tid; + oi->i_datasync_tid = handle->h_transaction->t_tid; leave: if (status < 0) { @@ -1855,7 +1855,6 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); - kfree(si.name); kfree(si.value); ocfs2_free_dir_lookup_result(&lookup); if (inode_ac) @@ -2481,6 +2480,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 553f53cc73ae..8d64a97a9d5e 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <linux/list.h> +#include <linux/llist.h> #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> @@ -274,19 +275,16 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 - -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; -struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -414,10 +412,9 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; - /* List of dentry locks to release. Anyone can add locks to - * the list, ocfs2_wq processes the list */ - struct ocfs2_dentry_lock *dentry_lock_list; - struct work_struct dentry_lock_work; + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; wait_queue_head_t osb_mount_event; @@ -449,6 +446,8 @@ struct ocfs2_super /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; + + struct mutex system_file_mutex; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) @@ -579,18 +578,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } - -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, - unsigned long flag) -{ - unsigned long ret; - - spin_lock(&osb->osb_lock); - ret = osb->osb_flags & flag; - spin_unlock(&osb->osb_lock); - return ret; -} - static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56cbe5c5..f266d67df3c6 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -28,6 +28,7 @@ struct ocfs2_dquot { unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ s64 dq_origspace; /* Last globally synced space usage */ s64 dq_originodes; /* Last globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ }; /* Description of one chunk to recover in memory */ @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, int ocfs2_create_local_dquot(struct dquot *dquot); int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index d7b5108789e2..b990a62cff50 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -10,6 +10,7 @@ #include <linux/jiffies.h> #include <linux/writeback.h> #include <linux/workqueue.h> +#include <linux/llist.h> #include <cluster/masklog.h> @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) OCFS2_INODE_UPDATE_CREDITS; } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(&osb->dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(&odquot->dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */ static int ocfs2_release_dquot(struct dquot *dquot) { handle_t *handle; @@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(&dquot->dq_count) > 1) goto out; + /* Running from downconvert thread? Postpone quota processing to wq */ + if (current == osb->dc_task) { + /* + * Grab our own reference to dquot and queue it for delayed + * dropping. Quota code rechecks after calling + * ->release_dquot() and won't free dquot structure. + */ + dqgrab(dquot); + /* First entry on list -> queue work */ + if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list)) + queue_work(ocfs2_wq, &osb->dquot_drop_work); + goto out; + } status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index ca5ce14cbddc..83f1a665ae97 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -496,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_max_locking_protocol = - __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, + __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, @@ -528,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = - __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, + __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, @@ -550,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = - __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, + __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, @@ -599,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, static struct kobj_attribute ocfs2_attr_cluster_stack = - __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, + __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); + + +static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "1\n"); +} + +static struct kobj_attribute ocfs2_attr_dlm_recover_support = + __ATTR(dlm_recover_callback_support, S_IRUGO, + ocfs2_dlm_recover_show, NULL); + static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, + &ocfs2_attr_dlm_recover_support.attr, NULL, }; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 47ae2663a6f5..0cb889a17ae1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -771,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; @@ -1607,6 +1608,21 @@ out: return ret; } +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl; + + cl = (struct ocfs2_chain_list *)&di->id2.i_chain; + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); + le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} + static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, struct ocfs2_extent_rec *rec, struct ocfs2_chain_list *cl) @@ -1707,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, res->sr_bit_offset, res->sr_bits); - if (ret < 0) + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); @@ -1838,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, res->sr_bit_offset, res->sr_bits); if (status < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; } @@ -2091,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir, ac->ac_find_loc_priv = res; *fe_blkno = res->sr_blkno; - + ocfs2_update_inode_fsync_trans(handle, dir, 0); out: if (handle) ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); @@ -2149,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, res->sr_bit_offset, res->sr_bits); if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(ret); goto out; } @@ -2870,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); goto bail; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 218d8036b3e7..2d2501767c0c 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -91,6 +91,10 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain); +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 49d84f80f36c..a7cdd56f4c79 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + oi->i_sync_tid = 0; + oi->i_datasync_tid = 0; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } @@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; + sync_filesystem(sb); + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; @@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); } -static void ocfs2_kill_sb(struct super_block *sb) -{ - struct ocfs2_super *osb = OCFS2_SB(sb); - - /* Failed mount? */ - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) - goto out; - - /* Prevent further queueing of inode drop events */ - spin_lock(&dentry_list_lock); - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); - spin_unlock(&dentry_list_lock); - /* Wait for work to finish and/or remove it */ - cancel_work_sync(&osb->dentry_lock_work); -out: - kill_block_super(sb); -} - static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .mount = ocfs2_mount, - .kill_sb = ocfs2_kill_sb, - + .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1612,14 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) return 0; } -wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; - static int __init ocfs2_init(void) { - int status, i; - - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) - init_waitqueue_head(&ocfs2__ioend_wq[i]); + int status; status = init_ocfs2_uptodate_cache(); if (status < 0) @@ -1761,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data) ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); oi->ip_dir_start_lookup = 0; - atomic_set(&oi->ip_unaligned_aio, 0); + mutex_init(&oi->ip_unaligned_aio); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); @@ -1932,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); - /* - * Flush inode dropping work queue so that deletes are - * performed while the filesystem is still working - */ - ocfs2_drop_all_dl_inodes(osb); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); ocfs2_disable_quotas(osb); + /* All dquots should be freed by now */ + WARN_ON(!llist_empty(&osb->dquot_drop_list)); + /* Wait for worker to be done with the work structure in osb */ + cancel_work_sync(&osb->dquot_drop_work); + ocfs2_shutdown_local_alloc(osb); /* This will disable recovery and flush any recovery work. */ @@ -2077,7 +2057,6 @@ static int ocfs2_initialize_super(struct super_block *sb, struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; struct ocfs2_journal *journal; - __le32 uuid_net_key; struct ocfs2_super *osb; u64 total_blocks; @@ -2123,6 +2102,8 @@ static int ocfs2_initialize_super(struct super_block *sb, spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_steal_slots(osb); + mutex_init(&osb->system_file_mutex); + atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); atomic_set(&osb->alloc_stats.bitmap_data, 0); @@ -2276,8 +2257,8 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); - osb->dentry_lock_list = NULL; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); + init_llist_head(&osb->dquot_drop_list); /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = @@ -2311,8 +2292,6 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key)); - strncpy(osb->vol_label, di->id2.i_super.s_label, 63); osb->vol_label[63] = '\0'; osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index f053688d22a3..af155c183123 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, } else arr = get_local_system_inode(osb, type, slot); + mutex_lock(&osb->system_file_mutex); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); + mutex_unlock(&osb->system_file_mutex); BUG_ON(!inode); return inode; @@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, *arr = igrab(inode); BUG_ON(!*arr); } + mutex_unlock(&osb->system_file_mutex); return inode; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 185fa3b7f962..016f01df3825 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) * them fully. */ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, - u64 xb_blkno) + u64 xb_blkno, int new) { int i, rc = 0; @@ -383,9 +383,16 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, } if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i])) - ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), - bucket->bu_bhs[i]); + bucket->bu_bhs[i])) { + if (new) + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + else { + set_buffer_uptodate(bucket->bu_bhs[i]); + ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode), + bucket->bu_bhs[i]); + } + } } if (rc) @@ -2602,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -3200,8 +3208,15 @@ meta_guess: clusters_add += 1; } } else { - meta_add += 1; credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) { + struct ocfs2_extent_list *el = &def_xv.xv.xr_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el); + } else { + meta_add += 1; + } } out: if (clusters_need) @@ -3614,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode, } ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0); ocfs2_commit_trans(osb, ctxt.handle); @@ -4294,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); - ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1); if (ret) { mlog_errno(ret); goto out; @@ -4638,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, * Even if !new_bucket_head, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head); if (ret) { mlog_errno(ret); goto out; @@ -4804,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, * Even if !t_is_new, we're overwriting t_bucket. Thus, * there's no need to read it. */ - ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new); if (ret) goto out; @@ -5476,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, ret = ocfs2_truncate_log_append(osb, handle, blkno, len); if (ret) mlog_errno(ret); + ocfs2_update_inode_fsync_trans(handle, inode, 0); out_commit: ocfs2_commit_trans(osb, handle); @@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle, break; } - ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1); if (ret) { mlog_errno(ret); break; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index d8b0afde2179..ec58c7659183 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode) */ static void omfs_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_nlink) diff --git a/fs/open.c b/fs/open.c index b9ed8b25c108..9d64679cec73 100644 --- a/fs/open.c +++ b/fs/open.c @@ -231,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + /* Punch hole and zero range are mutually exclusive */ + if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; /* Punch hole must have keep size set */ @@ -239,17 +245,31 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; + /* Collapse range should only be used exclusively. */ + if ((mode & FALLOC_FL_COLLAPSE_RANGE) && + (mode & ~FALLOC_FL_COLLAPSE_RANGE)) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - /* It's not possible punch hole on append only file */ - if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + /* + * We can only allow pure fallocate on append only files + */ + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* + * We can not allow to do any fallocate operation on an active + * swapfile + */ + if (IS_SWAPFILE(inode)) + ret = -ETXTBSY; + + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ @@ -632,35 +652,6 @@ out: return error; } -/* - * You have to be very careful that these write - * counts get cleaned up in error cases and - * upon __fput(). This should probably never - * be called outside of __dentry_open(). - */ -static inline int __get_file_write_access(struct inode *inode, - struct vfsmount *mnt) -{ - int error; - error = get_write_access(inode); - if (error) - return error; - /* - * Do not take mount writer counts on - * special files since no writes to - * the mount itself will occur. - */ - if (!special_file(inode->i_mode)) { - /* - * Balanced in __fput() - */ - error = __mnt_want_write(mnt); - if (error) - put_write_access(inode); - } - return error; -} - int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ @@ -685,26 +676,28 @@ static int do_dentry_open(struct file *f, f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - if (unlikely(f->f_flags & O_PATH)) - f->f_mode = FMODE_PATH; - path_get(&f->f_path); inode = f->f_inode = f->f_path.dentry->d_inode; - if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, f->f_path.mnt); - if (error) - goto cleanup_file; - if (!special_file(inode->i_mode)) - file_take_write(f); - } - f->f_mapping = inode->i_mapping; - if (unlikely(f->f_mode & FMODE_PATH)) { + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH; f->f_op = &empty_fops; return 0; } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = get_write_access(inode); + if (unlikely(error)) + goto cleanup_file; + error = __mnt_want_write(f->f_path.mnt); + if (unlikely(error)) { + put_write_access(inode); + goto cleanup_file; + } + f->f_mode |= FMODE_WRITER; + } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; @@ -741,18 +734,9 @@ static int do_dentry_open(struct file *f, cleanup_all: fops_put(f->f_op); - if (f->f_mode & FMODE_WRITE) { + if (f->f_mode & FMODE_WRITER) { put_write_access(inode); - if (!special_file(inode->i_mode)) { - /* - * We don't consider this a real - * mnt_want/drop_write() pair - * because it all happenend right - * here, so just reset the state. - */ - file_reset_write(f); - __mnt_drop_write(f->f_path.mnt); - } + __mnt_drop_write(f->f_path.mnt); } cleanup_file: path_put(&f->f_path); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 8c0ceb8dd1f7..15e4500cda3e 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_NOATIME; return 0; } diff --git a/fs/pipe.c b/fs/pipe.c index 78fd0d0788db..034bffac3f97 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, return 0; } -static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - if (__copy_to_user_inatomic(iov->iov_base, from, copy)) - return -EFAULT; - } else { - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; - } - from += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -/* - * Attempt to pre-fault in the user memory, so we can use atomic copies. - * Returns the number of bytes not faulted in. - */ -static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - if (fault_in_pages_writeable(iov->iov_base, this_len)) - break; - - len -= this_len; - iov++; - } - - return len; -} - /* * Pre-fault in the user memory, so we can use atomic copies. */ @@ -226,52 +177,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, } /** - * generic_pipe_buf_map - virtually map a pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be mapped - * @atomic: whether to use an atomic map - * - * Description: - * This function returns a kernel virtual address mapping for the - * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided - * and the caller has to be careful not to fault before calling - * the unmap function. - * - * Note that this function calls kmap_atomic() if @atomic != 0. - */ -void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, int atomic) -{ - if (atomic) { - buf->flags |= PIPE_BUF_FLAG_ATOMIC; - return kmap_atomic(buf->page); - } - - return kmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_map); - -/** - * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be unmapped - * @map_data: the data that the mapping function returned - * - * Description: - * This function undoes the mapping that ->map() provided. - */ -void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, void *map_data) -{ - if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { - buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; - kunmap_atomic(map_data); - } else - kunmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_unmap); - -/** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal @@ -351,8 +256,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -361,8 +264,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -379,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, ssize_t ret; struct iovec *iov = (struct iovec *)_iov; size_t total_len; + struct iov_iter iter; total_len = iov_length(iov, nr_segs); /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; + iov_iter_init(&iter, iov, nr_segs, total_len, 0); + do_wakeup = 0; ret = 0; __pipe_lock(pipe); @@ -394,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; const struct pipe_buf_operations *ops = buf->ops; - void *addr; size_t chars = buf->len; - int error, atomic; + size_t written; + int error; if (chars > total_len) chars = total_len; @@ -408,21 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } - atomic = !iov_fault_in_pages_write(iov, chars); -redo: - addr = ops->map(pipe, buf, atomic); - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); - ops->unmap(pipe, buf, addr); - if (unlikely(error)) { - /* - * Just retry with the slow path if we failed. - */ - if (atomic) { - atomic = 0; - goto redo; - } + written = copy_page_to_iter(buf->page, buf->offset, chars, &iter); + if (unlikely(written < chars)) { if (!ret) - ret = error; + ret = -EFAULT; break; } ret += chars; @@ -538,10 +431,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, iov_fault_in_pages_read(iov, chars); redo1: - addr = ops->map(pipe, buf, atomic); + if (atomic) + addr = kmap_atomic(buf->page); + else + addr = kmap(buf->page); error = pipe_iov_copy_from_user(offset + addr, iov, chars, atomic); - ops->unmap(pipe, buf, addr); + if (atomic) + kunmap_atomic(addr); + else + kunmap(buf->page); ret = error; do_wakeup = 1; if (error) { diff --git a/fs/pnode.c b/fs/pnode.c index 88396df725b4..302bf22c4a30 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m, } } -/* - * return the source mount to be used for cloning - * - * @dest the current destination mount - * @last_dest the last seen destination mount - * @last_src the last seen source mount - * @type return CL_SLAVE if the new mount has to be - * cloned as a slave. - */ -static struct mount *get_source(struct mount *dest, - struct mount *last_dest, - struct mount *last_src, - int *type) +static struct mount *next_group(struct mount *m, struct mount *origin) { - struct mount *p_last_src = NULL; - struct mount *p_last_dest = NULL; - - while (last_dest != dest->mnt_master) { - p_last_dest = last_dest; - p_last_src = last_src; - last_dest = last_dest->mnt_master; - last_src = last_src->mnt_master; + while (1) { + while (1) { + struct mount *next; + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + next = next_peer(m); + if (m->mnt_group_id == origin->mnt_group_id) { + if (next == origin) + return NULL; + } else if (m->mnt_slave.next != &next->mnt_slave) + break; + m = next; + } + /* m is the last peer */ + while (1) { + struct mount *master = m->mnt_master; + if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + m = next_peer(master); + if (master->mnt_group_id == origin->mnt_group_id) + break; + if (master->mnt_slave.next == &m->mnt_slave) + break; + m = master; + } + if (m == origin) + return NULL; } +} - if (p_last_dest) { - do { - p_last_dest = next_peer(p_last_dest); - } while (IS_MNT_NEW(p_last_dest)); - /* is that a peer of the earlier? */ - if (dest == p_last_dest) { - *type = CL_MAKE_SHARED; - return p_last_src; +/* all accesses are serialized by namespace_sem */ +static struct user_namespace *user_ns; +static struct mount *last_dest, *last_source, *dest_master; +static struct mountpoint *mp; +static struct hlist_head *list; + +static int propagate_one(struct mount *m) +{ + struct mount *child; + int type; + /* skip ones added by this propagate_mnt() */ + if (IS_MNT_NEW(m)) + return 0; + /* skip if mountpoint isn't covered by it */ + if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + return 0; + if (m->mnt_group_id == last_dest->mnt_group_id) { + type = CL_MAKE_SHARED; + } else { + struct mount *n, *p; + for (n = m; ; n = p) { + p = n->mnt_master; + if (p == dest_master || IS_MNT_MARKED(p)) { + while (last_dest->mnt_master != p) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + if (n->mnt_group_id != last_dest->mnt_group_id) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + break; + } } + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; } - /* slave of the earlier, then */ - *type = CL_SLAVE; - /* beginning of peer group among the slaves? */ - if (IS_MNT_SHARED(dest)) - *type |= CL_MAKE_SHARED; - return last_src; + + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(last_source, last_source->mnt.mnt_root, type); + if (IS_ERR(child)) + return PTR_ERR(child); + mnt_set_mountpoint(m, mp, child); + last_dest = m; + last_source = child; + if (m->mnt_master != dest_master) { + read_seqlock_excl(&mount_lock); + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); + } + hlist_add_head(&child->mnt_hash, list); + return 0; } /* @@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mount *m, *child; + struct mount *m, *n; int ret = 0; - struct mount *prev_dest_mnt = dest_mnt; - struct mount *prev_src_mnt = source_mnt; - HLIST_HEAD(tmp_list); - - for (m = propagation_next(dest_mnt, dest_mnt); m; - m = propagation_next(m, dest_mnt)) { - int type; - struct mount *source; - - if (IS_MNT_NEW(m)) - continue; - - source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; - - child = copy_tree(source, source->mnt.mnt_root, type); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - tmp_list = *tree_list; - tmp_list.first->pprev = &tmp_list.first; - INIT_HLIST_HEAD(tree_list); + + /* + * we don't want to bother passing tons of arguments to + * propagate_one(); everything is serialized by namespace_sem, + * so globals will do just fine. + */ + user_ns = current->nsproxy->mnt_ns->user_ns; + last_dest = dest_mnt; + last_source = source_mnt; + mp = dest_mp; + list = tree_list; + dest_master = dest_mnt->mnt_master; + + /* all peers of dest_mnt, except dest_mnt itself */ + for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { + ret = propagate_one(n); + if (ret) goto out; - } + } - if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { - mnt_set_mountpoint(m, dest_mp, child); - hlist_add_head(&child->mnt_hash, tree_list); - } else { - /* - * This can happen if the parent mount was bind mounted - * on some subdirectory of a shared/slave mount. - */ - hlist_add_head(&child->mnt_hash, &tmp_list); - } - prev_dest_mnt = m; - prev_src_mnt = child; + /* all slave groups */ + for (m = next_group(dest_mnt, dest_mnt); m; + m = next_group(m, dest_mnt)) { + /* everything in that slave group */ + n = m; + do { + ret = propagate_one(n); + if (ret) + goto out; + n = next_peer(n); + } while (n != m); } out: - lock_mount_hash(); - while (!hlist_empty(&tmp_list)) { - child = hlist_entry(tmp_list.first, struct mount, mnt_hash); - umount_tree(child, 0); + read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { + m = n->mnt_parent; + if (m->mnt_master != dest_mnt->mnt_master) + CLEAR_MNT_MARK(m->mnt_master); } - unlock_mount_hash(); + read_sequnlock_excl(&mount_lock); return ret; } diff --git a/fs/pnode.h b/fs/pnode.h index fc28a27fa892..4a246358b031 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -16,6 +16,9 @@ #define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 11c54fd51e16..0855f772cd41 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) umode_t mode = 0; int not_equiv = 0; + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: @@ -723,7 +729,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size) { posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + posix_acl_xattr_entry *ext_entry; int real_size, n; real_size = posix_acl_xattr_size(acl->a_count); @@ -731,7 +737,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, return real_size; if (real_size > size) return -ERANGE; - + + ext_entry = ext_acl->a_entries; ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { diff --git a/fs/proc/Makefile b/fs/proc/Makefile index ab30716584f5..239493ec718e 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o proc-$(CONFIG_PROC_VMCORE) += vmcore.o -proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o proc-$(CONFIG_PRINTK) += kmsg.o proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 656e401794de..64db2bceac59 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -138,8 +138,8 @@ static const char * const task_state_array[] = { "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "t (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)", /* 32 */ + "X (dead)", /* 16 */ + "Z (zombie)", /* 32 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/fs/proc/base.c b/fs/proc/base.c index b9760628e1fd..2d696b0c93bf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct task_struct *task, char * buffer) +static int proc_pid_cmdline(struct task_struct *task, char *buffer) { - int res = 0; - unsigned int len; - struct mm_struct *mm = get_task_mm(task); - if (!mm) - goto out; - if (!mm->arg_end) - goto out_mm; /* Shh! No looking before we're done */ - - len = mm->arg_end - mm->arg_start; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - - res = access_process_vm(task, mm->arg_start, buffer, len, 0); - - // If the nul at the end of args has been overwritten, then - // assume application is using setproctitle(3). - if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { - len = strnlen(buffer, res); - if (len < res) { - res = len; - } else { - len = mm->env_end - mm->env_start; - if (len > PAGE_SIZE - res) - len = PAGE_SIZE - res; - res += access_process_vm(task, mm->env_start, buffer+res, len, 0); - res = strnlen(buffer, res); - } - } -out_mm: - mmput(mm); -out: - return res; + return get_cmdline(task, buffer, PAGE_SIZE); } static int proc_pid_auxv(struct task_struct *task, char *buffer) @@ -1236,6 +1204,9 @@ static ssize_t proc_fault_inject_write(struct file * file, make_it_fail = simple_strtol(strstrip(buffer), &end, 0); if (*end) return -EINVAL; + if (make_it_fail < 0 || make_it_fail > 1) + return -EINVAL; + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -2588,7 +2559,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), @@ -2598,7 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), @@ -2617,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -2626,7 +2597,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), @@ -2927,14 +2898,14 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), + ONE("personality", S_IRUSR, proc_pid_personality), INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUGO, proc_pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), @@ -2955,7 +2926,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_tid_smaps_operations), - REG("pagemap", S_IRUGO, proc_pagemap_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -2964,7 +2935,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("wchan", S_IRUGO, proc_pid_wchan), #endif #ifdef CONFIG_STACKTRACE - ONE("stack", S_IRUGO, proc_pid_stack), + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, proc_pid_schedstat), diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 985ea881b5bc..0788d093f5d8 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> +#include "../mount.h" #include "internal.h" #include "fd.h" @@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v) } if (!ret) { - seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", - (long long)file->f_pos, f_flags); + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); if (file->f_op->show_fdinfo) ret = file->f_op->show_fdinfo(m, file); fput(file); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 124fc43c7090..0adbc02d60e3 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode) const struct proc_ns_operations *ns_ops; void *ns; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ @@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode) pde_put(de); head = PROC_I(inode)->sysctl; if (head) { - rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } /* Release any associated namespace */ diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 651d09a11dde..3ab6d14e71c5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -211,13 +211,6 @@ extern int proc_fill_super(struct super_block *); extern void proc_entry_rundown(struct proc_dir_entry *); /* - * proc_devtree.c - */ -#ifdef CONFIG_PROC_DEVICETREE -extern void proc_device_tree_init(void); -#endif - -/* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 136e548d9567..7445af0b1aa3 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) available += pagecache; /* - * Part of the reclaimable swap consists of items that are in use, + * Part of the reclaimable slab consists of items that are in use, * and cannot be freed. Cap this estimate at the low watermark. */ available += global_page_state(NR_SLAB_RECLAIMABLE) - diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 9ae46b87470d..89026095f2b5 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl struct task_struct *task; void *ns; char name[50]; - int len = -EACCES; + int res = -EACCES; task = get_proc_task(inode); if (!task) @@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - len = -ENOENT; + res = -ENOENT; ns = ns_ops->get(task); if (!ns) goto out_put_task; snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - len = strlen(name); - - if (len > buflen) - len = buflen; - if (copy_to_user(buffer, name, len)) - len = -EFAULT; - + res = readlink_copy(buffer, buflen, name); ns_ops->put(ns); out_put_task: put_task_struct(task); out: - return len; + return res; } static const struct inode_operations proc_ns_link_inode_operations = { diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c deleted file mode 100644 index c82dd5147845..000000000000 --- a/fs/proc/proc_devtree.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * proc_devtree.c - handles /proc/device-tree - * - * Copyright 1997 Paul Mackerras - */ -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/time.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/printk.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/of.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include "internal.h" - -static inline void set_node_proc_entry(struct device_node *np, - struct proc_dir_entry *de) -{ - np->pde = de; -} - -static struct proc_dir_entry *proc_device_tree; - -/* - * Supply data on a read from /proc/device-tree/node/property. - */ -static int property_proc_show(struct seq_file *m, void *v) -{ - struct property *pp = m->private; - - seq_write(m, pp->value, pp->length); - return 0; -} - -static int property_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, property_proc_show, __PDE_DATA(inode)); -} - -static const struct file_operations property_proc_fops = { - .owner = THIS_MODULE, - .open = property_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * For a node with a name like "gc@10", we make symlinks called "gc" - * and "@10" to it. - */ - -/* - * Add a property to a node - */ -static struct proc_dir_entry * -__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, - const char *name) -{ - struct proc_dir_entry *ent; - - /* - * Unfortunately proc_register puts each new entry - * at the beginning of the list. So we rearrange them. - */ - ent = proc_create_data(name, - strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, - de, &property_proc_fops, pp); - if (ent == NULL) - return NULL; - - if (!strncmp(name, "security-", 9)) - proc_set_size(ent, 0); /* don't leak number of password chars */ - else - proc_set_size(ent, pp->length); - - return ent; -} - - -void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop) -{ - __proc_device_tree_add_prop(pde, prop, prop->name); -} - -void proc_device_tree_remove_prop(struct proc_dir_entry *pde, - struct property *prop) -{ - remove_proc_entry(prop->name, pde); -} - -void proc_device_tree_update_prop(struct proc_dir_entry *pde, - struct property *newprop, - struct property *oldprop) -{ - struct proc_dir_entry *ent; - - if (!oldprop) { - proc_device_tree_add_prop(pde, newprop); - return; - } - - for (ent = pde->subdir; ent != NULL; ent = ent->next) - if (ent->data == oldprop) - break; - if (ent == NULL) { - pr_warn("device-tree: property \"%s\" does not exist\n", - oldprop->name); - } else { - ent->data = newprop; - ent->size = newprop->length; - } -} - -/* - * Various dodgy firmware might give us nodes and/or properties with - * conflicting names. That's generally ok, except for exporting via /proc, - * so munge names here to ensure they're unique. - */ - -static int duplicate_name(struct proc_dir_entry *de, const char *name) -{ - struct proc_dir_entry *ent; - int found = 0; - - spin_lock(&proc_subdir_lock); - - for (ent = de->subdir; ent != NULL; ent = ent->next) { - if (strcmp(ent->name, name) == 0) { - found = 1; - break; - } - } - - spin_unlock(&proc_subdir_lock); - - return found; -} - -static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, - const char *name) -{ - char *fixed_name; - int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */ - int i = 1, size; - -realloc: - fixed_name = kmalloc(fixup_len, GFP_KERNEL); - if (fixed_name == NULL) { - pr_err("device-tree: Out of memory trying to fixup " - "name \"%s\"\n", name); - return name; - } - -retry: - size = snprintf(fixed_name, fixup_len, "%s#%d", name, i); - size++; /* account for NULL */ - - if (size > fixup_len) { - /* We ran out of space, free and reallocate. */ - kfree(fixed_name); - fixup_len = size; - goto realloc; - } - - if (duplicate_name(de, fixed_name)) { - /* Multiple duplicates. Retry with a different offset. */ - i++; - goto retry; - } - - pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n", - np->full_name, fixed_name); - - return fixed_name; -} - -/* - * Process a node, adding entries for its children and its properties. - */ -void proc_device_tree_add_node(struct device_node *np, - struct proc_dir_entry *de) -{ - struct property *pp; - struct proc_dir_entry *ent; - struct device_node *child; - const char *p; - - set_node_proc_entry(np, de); - for (child = NULL; (child = of_get_next_child(np, child));) { - /* Use everything after the last slash, or the full name */ - p = kbasename(child->full_name); - - if (duplicate_name(de, p)) - p = fixup_name(np, de, p); - - ent = proc_mkdir(p, de); - if (ent == NULL) - break; - proc_device_tree_add_node(child, ent); - } - of_node_put(child); - - for (pp = np->properties; pp != NULL; pp = pp->next) { - p = pp->name; - - if (strchr(p, '/')) - continue; - - if (duplicate_name(de, p)) - p = fixup_name(np, de, p); - - ent = __proc_device_tree_add_prop(de, pp, p); - if (ent == NULL) - break; - } -} - -/* - * Called on initialization to set up the /proc/device-tree subtree - */ -void __init proc_device_tree_init(void) -{ - struct device_node *root; - - proc_device_tree = proc_mkdir("device-tree", NULL); - if (proc_device_tree == NULL) - return; - root = of_find_node_by_path("/"); - if (root == NULL) { - remove_proc_entry("device-tree", NULL); - pr_debug("/proc/device-tree: can't find root\n"); - return; - } - proc_device_tree_add_node(root, proc_device_tree); - of_node_put(root); -} diff --git a/fs/proc/root.c b/fs/proc/root.c index 87dbcbef7fe4..5dbadecb234d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) int proc_remount(struct super_block *sb, int *flags, char *data) { struct pid_namespace *pid = sb->s_fs_info; + + sync_filesystem(sb); return !proc_parse_options(data, pid); } @@ -183,9 +185,6 @@ void __init proc_root_init(void) proc_mkdir("openprom", NULL); #endif proc_tty_init(); -#ifdef CONFIG_PROC_DEVICETREE - proc_device_tree_init(); -#endif proc_mkdir("bus", NULL); proc_sys_init(); } diff --git a/fs/proc/self.c b/fs/proc/self.c index ffeb202ec942..4348bb8907c2 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, if (!tgid) return -ENOENT; sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); + return readlink_copy(buffer, buflen, tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 6f599c62f0cc..9d231e9e5f0e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -9,7 +9,7 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/irqnr.h> -#include <asm/cputime.h> +#include <linux/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fb52b548080d..c4b2646b6d7c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,4 +1,5 @@ #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at + * vmacache most of the time. We have zero last_addr at * the beginning and also after lseek. We will have -1 last_addr * after the end of the vmas. */ @@ -1350,7 +1351,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, struct numa_maps *md; struct page *page; - if (pte_none(*pte)) + if (!pte_present(*pte)) return 0; page = pte_page(*pte); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 7141b8d0ca9e..33de567c25af 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,7 +5,7 @@ #include <linux/seq_file.h> #include <linux/time.h> #include <linux/kernel_stat.h> -#include <asm/cputime.h> +#include <linux/cputime.h> static int uptime_proc_show(struct seq_file *m, void *v) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 88d4585b30f1..6a8e785b29da 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -484,7 +484,6 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) phdr_ptr->p_memsz = real_sz; if (real_sz == 0) { pr_warn("Warning: Zero PT_NOTE entries found\n"); - return -EINVAL; } } @@ -671,7 +670,6 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) phdr_ptr->p_memsz = real_sz; if (real_sz == 0) { pr_warn("Warning: Zero PT_NOTE entries found\n"); - return -EINVAL; } } @@ -1118,4 +1116,3 @@ void vmcore_cleanup(void) } free_elfcorebuf(); } -EXPORT_SYMBOL_GPL(vmcore_cleanup); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7be26f03a3f5..1a81373947f3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->root = root; p->m.poll_event = ns->event; p->show = show; + p->cached_event = ~0ULL; return 0; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 12823845d324..192297b0090d 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -249,6 +249,7 @@ static void parse_options(char *options) static int pstore_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); parse_options(data); return 0; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 78c3c2097787..46d269e38706 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -497,6 +497,7 @@ void pstore_get_records(int quiet) big_oops_buf_sz); if (unzipped_len > 0) { + kfree(buf); buf = big_oops_buf; size = unzipped_len; compressed = false; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index fa8cef2cca3a..3b5744306ed8 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -86,6 +86,7 @@ struct ramoops_context { struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; + /* _read_cnt need clear on ramoops_pstore_open */ unsigned int dump_read_cnt; unsigned int console_read_cnt; unsigned int ftrace_read_cnt; @@ -101,6 +102,7 @@ static int ramoops_pstore_open(struct pstore_info *psi) cxt->dump_read_cnt = 0; cxt->console_read_cnt = 0; + cxt->ftrace_read_cnt = 0; return 0; } @@ -117,13 +119,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, return NULL; prz = przs[i]; + if (!prz) + return NULL; - if (update) { - /* Update old/shadowed buffer. */ + /* Update old/shadowed buffer. */ + if (update) persistent_ram_save_old(prz); - if (!persistent_ram_old_size(prz)) - return NULL; - } + + if (!persistent_ram_old_size(prz)) + return NULL; *typep = type; *id = i; @@ -316,6 +320,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; + cxt->max_dump_cnt = 0; if (!cxt->przs) return; @@ -346,7 +351,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, GFP_KERNEL); if (!cxt->przs) { dev_err(dev, "failed to initialize a prz array for dumps\n"); - return -ENOMEM; + goto fail_prz; } for (i = 0; i < cxt->max_dump_cnt; i++) { @@ -428,7 +433,6 @@ static int ramoops_probe(struct platform_device *pdev) if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); - cxt->dump_read_cnt = 0; cxt->size = pdata->mem_size; cxt->phys_addr = pdata->mem_address; cxt->record_size = pdata->record_size; @@ -505,7 +509,6 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - cxt->max_dump_cnt = 0; fail_cnt: kfree(cxt->fprz); fail_init_fprz: diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index de272d426763..ff7e3d4df5a1 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -54,7 +54,7 @@ static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a) do { old = atomic_read(&prz->buffer->start); new = old + a; - while (unlikely(new > prz->buffer_size)) + while (unlikely(new >= prz->buffer_size)) new -= prz->buffer_size; } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); @@ -91,7 +91,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) old = atomic_read(&prz->buffer->start); new = old + a; - while (unlikely(new > prz->buffer_size)) + while (unlikely(new >= prz->buffer_size)) new -= prz->buffer_size; atomic_set(&prz->buffer->start, new); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 89558810381c..c4bcb778886e 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) { struct qnx4_sb_info *qs; + sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; *flags |= MS_RDONLY; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 8d941edfefa1..65cdaab3ed49 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index 880fd9884366..c51df1dd237e 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -8,9 +8,10 @@ config QUOTA help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the - ext2, ext3, and reiserfs file system. ext3 also supports journalled - quotas for which you don't need to run quotacheck(8) after an unclean - shutdown. + ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems. + Note that gfs2 and xfs use their own quota system. + Ext3, ext4 and reiserfs also support journaled quotas for which + you don't need to run quotacheck(8) after an unclean shutdown. For further details, read the Quota mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>, or the documentation provided with the quota tools. Probably the quota support is only useful for diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index cfc8dcc16043..9cd5f63715c0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -528,7 +528,7 @@ restart: if (atomic_read(&dquot->dq_count)) { DEFINE_WAIT(wait); - atomic_inc(&dquot->dq_count); + dqgrab(dquot); prepare_to_wait(&dquot->dq_wait_unused, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&dq_list_lock); @@ -632,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type) /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ - atomic_inc(&dquot->dq_count); + dqgrab(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); err = sb->dq_op->write_dquot(dquot); diff --git a/fs/read_write.c b/fs/read_write.c index 28cc9c810744..31c6efa43183 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -994,9 +994,9 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, return ret; } -COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, - const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos) +static long __compat_sys_preadv64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { struct fd f; ssize_t ret; @@ -1013,12 +1013,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 +COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_preadv64(fd, vec, vlen, pos); +} +#endif + COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_preadv64(fd, vec, vlen, pos); + + return __compat_sys_preadv64(fd, vec, vlen, pos); } static size_t compat_writev(struct file *file, @@ -1061,9 +1071,9 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, return ret; } -COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, - const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos) +static long __compat_sys_pwritev64(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos) { struct fd f; ssize_t ret; @@ -1080,12 +1090,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 +COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos) +{ + return __compat_sys_pwritev64(fd, vec, vlen, pos); +} +#endif + COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; - return compat_sys_pwritev64(fd, vec, vlen, pos); + + return __compat_sys_pwritev64(fd, vec, vlen, pos); } #endif diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 1fd2051109a3..af677353a3f5 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -125,6 +125,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) int d_reclen; char *d_name; ino_t d_ino; + loff_t cur_pos = deh_offset(deh); if (!de_visible(deh)) /* it is hidden entry */ @@ -196,8 +197,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) if (local_buf != small_buf) { kfree(local_buf); } - // next entry should be looked for with such offset - next_pos = deh_offset(deh) + 1; + + /* deh_offset(deh) may be invalid now. */ + next_pos = cur_pos + 1; if (item_moved(&tmp_ih, &path_to_entry)) { set_cpu_key_k_offset(&pos_key, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ad62bdbb451e..bc8b8009897d 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -35,7 +35,7 @@ void reiserfs_evict_inode(struct inode *inode) if (!inode->i_nlink && !is_bad_inode(inode)) dquot_initialize(inode); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto no_delete; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 8d06adf89948..83d4eac8059a 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -2831,6 +2831,7 @@ void reiserfs_init_alloc_options(struct super_block *s); */ __le32 reiserfs_choose_packing(struct inode *dir); +void show_alloc_options(struct seq_file *seq, struct super_block *s); int reiserfs_init_bitmap_cache(struct super_block *sb); void reiserfs_free_bitmap_cache(struct super_block *sb); void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2c803353f8ac..9fb20426005e 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) static int reiserfs_remount(struct super_block *s, int *flags, char *data); static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); -void show_alloc_options(struct seq_file *seq, struct super_block *s); static int reiserfs_sync_fs(struct super_block *s, int wait) { @@ -597,7 +596,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", sizeof(struct @@ -1319,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif + sync_filesystem(s); reiserfs_write_lock(s); #ifdef CONFIG_QUOTA diff --git a/fs/romfs/super.c b/fs/romfs/super.c index d8418782862b..ef90e8bca95a 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ static int romfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/splice.c b/fs/splice.c index 12028fa41def..e246954ea48c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,8 +136,6 @@ error: const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, goto out; if (buf->page != page) { - char *src = buf->ops->map(pipe, buf, 1); + char *src = kmap_atomic(buf->page); char *dst = kmap_atomic(page); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); } ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, page, fsdata); @@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *data; loff_t tmp = sd->pos; - data = buf->ops->map(pipe, buf, 0); + data = kmap(buf->page); ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - buf->ops->unmap(pipe, buf, data); + kunmap(buf->page); return ret; } @@ -1528,116 +1520,50 @@ static int get_iovec_page_array(const struct iovec __user *iov, static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - char *src; - int ret; - - /* - * See if we can use the atomic maps, by prefaulting in the - * pages and doing an atomic copy - */ - if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = buf->ops->map(pipe, buf, 1); - ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, - sd->len); - buf->ops->unmap(pipe, buf, src); - if (!ret) { - ret = sd->len; - goto out; - } - } - - /* - * No dice, use slow non-atomic map and copy - */ - src = buf->ops->map(pipe, buf, 0); - - ret = sd->len; - if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) - ret = -EFAULT; - - buf->ops->unmap(pipe, buf, src); -out: - if (ret > 0) - sd->u.userptr += ret; - return ret; + int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); + return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, +static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct splice_desc sd; - ssize_t size; - int error; long ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t count; pipe = get_pipe_info(file); if (!pipe) return -EBADF; - pipe_lock(pipe); - - error = ret = 0; - while (nr_segs) { - void __user *base; - size_t len; - - /* - * Get user address base and length for this iovec. - */ - error = get_user(base, &iov->iov_base); - if (unlikely(error)) - break; - error = get_user(len, &iov->iov_len); - if (unlikely(error)) - break; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - if (unlikely(!len)) - break; - if (unlikely(!base)) { - error = -EFAULT; - break; - } - - if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { - error = -EFAULT; - break; - } - - sd.len = 0; - sd.total_len = len; - sd.flags = flags; - sd.u.userptr = base; - sd.pos = 0; - - size = __splice_from_pipe(pipe, &sd, pipe_to_user); - if (size < 0) { - if (!ret) - ret = size; - - break; - } - - ret += size; + ret = rw_copy_check_uvector(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + goto out; - if (size < len) - break; + count = ret; + iov_iter_init(&iter, iov, nr_segs, count, 0); - nr_segs--; - iov++; - } + sd.len = 0; + sd.total_len = count; + sd.flags = flags; + sd.u.data = &iter; + sd.pos = 0; + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); - if (!ret) - ret = error; +out: + if (iov != iovstack) + kfree(iov); return ret; } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 202df6312d4e..031c8d67fd51 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { + sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } diff --git a/fs/super.c b/fs/super.c index 80d5cf2ca765..48377f7463c0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } - sync_filesystem(sb); - if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); if (retval) { @@ -802,7 +800,10 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; int get_anon_bdev(dev_t *p) { diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig index 8c41feacbac5..b2756014508c 100644 --- a/fs/sysfs/Kconfig +++ b/fs/sysfs/Kconfig @@ -1,6 +1,7 @@ config SYSFS bool "sysfs file system support" if EXPERT default y + select KERNFS help The sysfs filesystem is a virtual filesystem that the kernel uses to export internal kernel objects, their attributes, and their diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ee0d761c3179..0b45ff42f374 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -19,39 +19,18 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); -/** - * sysfs_pathname - return full path to sysfs dirent - * @kn: kernfs_node whose path we want - * @path: caller allocated buffer of size PATH_MAX - * - * Gives the name "/" to the sysfs_root entry; any path returned - * is relative to wherever sysfs is mounted. - */ -static char *sysfs_pathname(struct kernfs_node *kn, char *path) -{ - if (kn->parent) { - sysfs_pathname(kn->parent, path); - strlcat(path, "/", PATH_MAX); - } - strlcat(path, kn->name, PATH_MAX); - return path; -} - void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - char *path; + char *buf, *path = NULL; - path = kzalloc(PATH_MAX, GFP_KERNEL); - if (path) { - sysfs_pathname(parent, path); - strlcat(path, "/", PATH_MAX); - strlcat(path, name, PATH_MAX); - } + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (buf) + path = kernfs_path(parent, buf, PATH_MAX); - WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", - path ? path : name); + WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", + path, name); - kfree(path); + kfree(buf); } /** @@ -122,9 +101,13 @@ void sysfs_remove_dir(struct kobject *kobj) int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { - struct kernfs_node *parent = kobj->sd->parent; + struct kernfs_node *parent; + int ret; - return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); + parent = kernfs_get_parent(kobj->sd); + ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); + kernfs_put(parent); + return ret; } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, @@ -133,7 +116,6 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; - BUG_ON(!kn->parent); new_parent = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : sysfs_root_kn; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 810cf6e613e5..e9ef59b3abb1 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) ssize_t count; char *buf; - /* acquire buffer and ensure that it's >= PAGE_SIZE */ + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } + memset(buf, 0, PAGE_SIZE); /* * Invoke show(). Control may reach here via seq file lseek even @@ -372,6 +373,29 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); +/** + * sysfs_remove_file_self - remove an object attribute from its own method + * @kobj: object we're acting for + * @attr: attribute descriptor + * + * See kernfs_remove_self() for details. + */ +bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) +{ + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; + bool ret; + + kn = kernfs_find_and_get(parent, attr->name); + if (WARN_ON_ONCE(!kn)) + return false; + + ret = kernfs_remove_self(kn); + + kernfs_put(kn); + return ret; +} + void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { int i; @@ -430,95 +454,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. - */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 6b579387c67a..aa0406895b53 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -70,8 +70,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, if (grp->bin_attrs) { for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { if (update) - sysfs_remove_bin_file(kobj, *bin_attr); - error = sysfs_create_bin_file(kobj, *bin_attr); + kernfs_remove_by_name(parent, + (*bin_attr)->attr.name); + error = sysfs_add_file_mode_ns(parent, + &(*bin_attr)->attr, true, + (*bin_attr)->attr.mode, NULL); if (error) break; } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 3eaf5c6622eb..8a49486bf30c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,6 +13,7 @@ #define DEBUG #include <linux/fs.h> +#include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/user_namespace.h> @@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, } ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, + SYSFS_MAGIC, &new_sb, ns); if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); return root; @@ -63,7 +65,8 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, NULL); + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c327d4ee1235..88956309cc86 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); + sync_filesystem(sb); if (sbi->s_forced_ro) *flags |= MS_RDONLY; return 0; @@ -295,7 +296,7 @@ int sysv_sync_inode(struct inode *inode) static void sysv_evict_inode(struct inode *inode) { - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; sysv_truncate(inode); diff --git a/fs/timerfd.c b/fs/timerfd.c index 929312180dd0..0013142c0475 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -317,6 +317,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_REALTIME_ALARM && + clockid != CLOCK_BOOTTIME && clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 123c79b7261e..4f34dbae823d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1538,6 +1538,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5ded8490c0c6..a81c7b556896 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode) dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(!atomic_read(&inode->i_count)); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto done; @@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - return err; + goto out; } err = check_free_space(c); @@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) int err; struct ubifs_info *c = sb->s_fs_info; + sync_filesystem(sb); dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); err = ubifs_parse_options(c, data, 1); diff --git a/fs/udf/file.c b/fs/udf/file.c index 1037637957c7..d2c170f8b035 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } else up_write(&iinfo->i_data_sem); - retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + retval = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (retval > 0) { diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 982ce05c87ed..5d643706212f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode) want_delete = 1; udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); - } else - truncate_inode_pages(&inode->i_data, 0); + } + truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && diff --git a/fs/udf/super.c b/fs/udf/super.c index 3306b9f69bed..3286db047a40 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -175,7 +175,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), @@ -505,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, while ((p = strsep(&options, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; + unsigned n; if (!*p) continue; @@ -516,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_bs: if (match_int(&args[0], &option)) return 0; - uopt->blocksize = option; + n = option; + if (n != 512 && n != 1024 && n != 2048 && n != 4096) + return 0; + uopt->blocksize = n; uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET); break; case Opt_unhide: @@ -646,6 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) int error = 0; struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb); + sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a7ea492ae660..0ab1de4b39a5 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, bit, end_bit, bbase, blkmap, i; @@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned overflow, cgno, bit, end_bit, i; @@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); UFSD("ENTER, fragment %llu, count %u\n", (unsigned long long)fragment, count); @@ -499,7 +495,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned cgno, fragno, fragoff, count, fragsize, i; @@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first (uspi); count = newcount - oldcount; cgno = ufs_dtog(uspi, fragment); @@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; unsigned oldcg, i, j, k, allocsize; @@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); oldcg = cgno; /* @@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode, { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cylinder_group * ucg; u64 result, blkno; @@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode, sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal == 0) { @@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe }; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_super_block_first *usb1; struct ufs_cylinder_group *ucg; unsigned start, length, loc; unsigned pos, want, blockmap, mask, end; @@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, (unsigned long long)goal, count); - usb1 = ubh_get_usb_first (uspi); ucg = ubh_get_ucg(UCPI_UBH(ucpi)); if (goal) diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index d0426d74817b..98f7211599ff 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode) { struct super_block * sb; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; int is_directory; @@ -67,7 +66,6 @@ void ufs_free_inode (struct inode * inode) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - usb1 = ubh_get_usb_first(uspi); ino = inode->i_ino; @@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) struct super_block * sb; struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; - struct ufs_super_block_first * usb1; struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; struct inode * inode; @@ -195,7 +192,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) ufsi = UFS_I(inode); sbi = UFS_SB(sb); uspi = sbi->s_uspi; - usb1 = ubh_get_usb_first(uspi); mutex_lock(&sbi->s_lock); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c8ca96086784..61e8a9b021dd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode) if (!inode->i_nlink && !is_bad_inode(inode)) want_delete = 1; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); if (want_delete) { loff_t old_i_size; /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 329f2f53b7ed..c1183f9f69dc 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb) struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; - struct ufs_super_block_third *usb3; UFSD("ENTER\n"); - usb3 = ubh_get_usb_third(uspi); /* * Read cs structures from (usually) first data block * on the device. @@ -1280,6 +1278,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned new_mount_opt, ufstype; unsigned flags; + sync_filesystem(sb); lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; @@ -1389,15 +1388,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; - struct ufs_super_block_first *usb1; - struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); lock_ufs(sb); - usb1 = ubh_get_usb_first(uspi); - usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { @@ -1453,7 +1448,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 66a36befc5c0..844e288b9576 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -65,12 +65,31 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { + unsigned noio_flag = 0; void *ptr; + gfp_t lflags; ptr = kmem_zalloc(size, flags | KM_MAYFAIL); if (ptr) return ptr; - return vzalloc(size); + + /* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context + * here. Hence we need to tell memory reclaim that we are in such a + * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * the filesystem here and potentially deadlocking. + */ + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + noio_flag = memalloc_noio_save(); + + lflags = kmem_flags_convert(flags); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + memalloc_noio_restore(noio_flag); + + return ptr; } void diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 0ecec1896f25..6888ad886ff6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -281,7 +281,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (!acl) goto set_acl; - error = -EINVAL; + error = -E2BIG; if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) return error; diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 3fc109819c34..0fdd4109c624 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -89,6 +89,8 @@ typedef struct xfs_agf { /* structure must be padded to 64 bit alignment */ } xfs_agf_t; +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + #define XFS_AGF_MAGICNUM 0x00000001 #define XFS_AGF_VERSIONNUM 0x00000002 #define XFS_AGF_SEQNO 0x00000004 @@ -167,6 +169,8 @@ typedef struct xfs_agi { /* structure must be padded to 64 bit alignment */ } xfs_agi_t; +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + #define XFS_AGI_MAGICNUM 0x00000001 #define XFS_AGI_VERSIONNUM 0x00000002 #define XFS_AGI_SEQNO 0x00000004 @@ -222,6 +226,8 @@ typedef struct xfs_agfl { __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ } xfs_agfl_t; +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) + /* * tags for inode radix tree */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 9eab2dfdcbb5..c1cf6a336a72 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -474,7 +474,6 @@ xfs_agfl_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - int agfl_ok = 1; /* * There is no verification of non-crc AGFLs because mkfs does not @@ -485,15 +484,13 @@ xfs_agfl_read_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agfl, agfl_crc)); - - agfl_ok = agfl_ok && xfs_agfl_verify(bp); - - if (!agfl_ok) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_agfl_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -508,16 +505,15 @@ xfs_agfl_write_verify( return; if (!xfs_agfl_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } if (bip) XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agfl, agfl_crc)); + xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); } const struct xfs_buf_ops xfs_agfl_buf_ops = { @@ -2238,19 +2234,17 @@ xfs_agf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - int agf_ok = 1; - - if (xfs_sb_version_hascrc(&mp->m_sb)) - agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agf, agf_crc)); - agf_ok = agf_ok && xfs_agf_verify(mp, bp); - - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, + XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -2261,8 +2255,8 @@ xfs_agf_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; if (!xfs_agf_verify(mp, bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -2272,8 +2266,7 @@ xfs_agf_write_verify( if (bip) XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agf, agf_crc)); + xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } const struct xfs_buf_ops xfs_agf_buf_ops = { diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 13085429e523..cc1eadcbb049 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -355,12 +355,14 @@ static void xfs_allocbt_read_verify( struct xfs_buf *bp) { - if (!(xfs_btree_sblock_verify_crc(bp) && - xfs_allocbt_verify(bp))) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_allocbt_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } } @@ -370,9 +372,9 @@ xfs_allocbt_write_verify( { if (!xfs_allocbt_verify(bp)) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_btree_sblock_calc_crc(bp); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index db2cfb067d0b..0479c32c5eb1 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -632,38 +632,46 @@ xfs_map_at_offset( } /* - * Test if a given page is suitable for writing as part of an unwritten - * or delayed allocate extent. + * Test if a given page contains at least one buffer of a given @type. + * If @check_all_buffers is true, then we walk all the buffers in the page to + * try to find one of the type passed in. If it is not set, then the caller only + * needs to check the first buffer on the page for a match. */ -STATIC int +STATIC bool xfs_check_page_type( struct page *page, - unsigned int type) + unsigned int type, + bool check_all_buffers) { - if (PageWriteback(page)) - return 0; + struct buffer_head *bh; + struct buffer_head *head; - if (page->mapping && page_has_buffers(page)) { - struct buffer_head *bh, *head; - int acceptable = 0; + if (PageWriteback(page)) + return false; + if (!page->mapping) + return false; + if (!page_has_buffers(page)) + return false; - bh = head = page_buffers(page); - do { - if (buffer_unwritten(bh)) - acceptable += (type == XFS_IO_UNWRITTEN); - else if (buffer_delay(bh)) - acceptable += (type == XFS_IO_DELALLOC); - else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable += (type == XFS_IO_OVERWRITE); - else - break; - } while ((bh = bh->b_this_page) != head); + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + if (type == XFS_IO_UNWRITTEN) + return true; + } else if (buffer_delay(bh)) { + if (type == XFS_IO_DELALLOC) + return true; + } else if (buffer_dirty(bh) && buffer_mapped(bh)) { + if (type == XFS_IO_OVERWRITE) + return true; + } - if (acceptable) - return 1; - } + /* If we are only checking the first buffer, we are done now. */ + if (!check_all_buffers) + break; + } while ((bh = bh->b_this_page) != head); - return 0; + return false; } /* @@ -697,7 +705,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_check_page_type(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) goto fail_unlock_page; /* @@ -742,6 +750,15 @@ xfs_convert_page( p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; + /* + * The moment we find a buffer that doesn't match our current type + * specification or can't be written, abort the loop and start + * writeback. As per the above xfs_imap_valid() check, only + * xfs_vm_writepage() can handle partial page writeback fully - we are + * limited here to the buffers that are contiguous with the current + * ioend, and hence a buffer we can't write breaks that contiguity and + * we have to defer the rest of the IO to xfs_vm_writepage(). + */ bh = head = page_buffers(page); do { if (offset >= end_offset) @@ -750,7 +767,7 @@ xfs_convert_page( uptodate = 0; if (!(PageUptodate(page) || buffer_uptodate(bh))) { done = 1; - continue; + break; } if (buffer_unwritten(bh) || buffer_delay(bh) || @@ -762,10 +779,11 @@ xfs_convert_page( else type = XFS_IO_OVERWRITE; - if (!xfs_imap_valid(inode, imap, offset)) { - done = 1; - continue; - } + /* + * imap should always be valid because of the above + * partial page end_offset check on the imap. + */ + ASSERT(xfs_imap_valid(inode, imap, offset)); lock_buffer(bh); if (type != XFS_IO_OVERWRITE) @@ -777,6 +795,7 @@ xfs_convert_page( count++; } else { done = 1; + break; } } while (offset += len, (bh = bh->b_this_page) != head); @@ -868,7 +887,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) + if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1325,6 +1344,14 @@ __xfs_get_blocks( /* * If this is O_DIRECT or the mpage code calling tell them how large * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. */ if (direct || size > (1 << inode->i_blkbits)) { xfs_off_t mapping_size; @@ -1335,6 +1362,12 @@ __xfs_get_blocks( ASSERT(mapping_size > 0); if (mapping_size > size) mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1441,7 +1474,8 @@ xfs_vm_direct_IO( ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, 0); + xfs_end_io_direct_write, NULL, + DIO_ASYNC_EXTEND); if (ret != -EIOCBQUEUED && iocb->private) goto out_destroy_ioend; } else { @@ -1546,6 +1580,16 @@ xfs_vm_write_failed( xfs_vm_kill_delalloc_range(inode, block_offset, block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); } } @@ -1579,12 +1623,21 @@ xfs_vm_write_begin( status = __block_write_begin(page, pos, len, xfs_get_blocks); if (unlikely(status)) { struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); xfs_vm_write_failed(inode, page, pos, len); unlock_page(page); - if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, i_size_read(inode)); + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } page_cache_release(page); page = NULL; @@ -1595,9 +1648,12 @@ xfs_vm_write_begin( } /* - * On failure, we only need to kill delalloc blocks beyond EOF because they - * will never be written. For blocks within EOF, generic_write_end() zeros them - * so they are safe to leave alone and be written with all the other valid data. + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. */ STATIC int xfs_vm_write_end( @@ -1620,8 +1676,11 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, isize); + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); } } return ret; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 01b6a0102fbd..abda1124a70f 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -213,7 +213,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* @@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -999,13 +1011,22 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } retval = xfs_attr3_leaf_add(blk->bp, state->args); @@ -1133,6 +1154,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 7b126f46a2f9..511c283459b1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -213,8 +213,8 @@ xfs_attr3_leaf_write_verify( struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; if (!xfs_attr3_leaf_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -224,7 +224,7 @@ xfs_attr3_leaf_write_verify( if (bip) hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF); } /* @@ -239,13 +239,14 @@ xfs_attr3_leaf_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_LEAF_CRC_OFF)) || - !xfs_attr3_leaf_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_attr3_leaf_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { @@ -1228,6 +1229,7 @@ xfs_attr3_leaf_add_work( name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -2166,11 +2168,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - args->valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, - args->valuelen); + args->rmtvaluelen); return XFS_ERROR(EEXIST); } } @@ -2219,19 +2221,19 @@ xfs_attr3_leaf_getvalue( name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - valuelen); + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } return 0; } @@ -2518,7 +2520,7 @@ xfs_attr3_leaf_clearflag( ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2676,7 +2678,7 @@ xfs_attr3_leaf_flipflags( ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 01db96f60cf0..833fe5d98d80 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int( args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; + args.rmtvaluelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = xfs_attr3_rmt_blocks( diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 5549d69ddb45..d2e6e948cec7 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -125,7 +125,6 @@ xfs_attr3_rmt_read_verify( struct xfs_mount *mp = bp->b_target->bt_mount; char *ptr; int len; - bool corrupt = false; xfs_daddr_t bno; /* no verification of non-crc buffers */ @@ -140,11 +139,11 @@ xfs_attr3_rmt_read_verify( while (len > 0) { if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF)) { - corrupt = true; + xfs_buf_ioerror(bp, EFSBADCRC); break; } if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { - corrupt = true; + xfs_buf_ioerror(bp, EFSCORRUPTED); break; } len -= XFS_LBSIZE(mp); @@ -152,10 +151,9 @@ xfs_attr3_rmt_read_verify( bno += mp->m_bsize; } - if (corrupt) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - } else + if (bp->b_error) + xfs_verifier_error(bp); + else ASSERT(len == 0); } @@ -180,9 +178,8 @@ xfs_attr3_rmt_write_verify( while (len > 0) { if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } if (bip) { @@ -340,7 +337,7 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; __uint8_t *dst = args->value; - int valuelen = args->valuelen; + int valuelen; int nmap; int error; int blkcnt = args->rmtblkcnt; @@ -350,7 +347,9 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + valuelen = args->rmtvaluelen; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, @@ -418,7 +417,7 @@ xfs_attr_rmtval_set( * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) @@ -483,7 +482,7 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - valuelen = args->valuelen; + valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 152543c4ca70..f0efc7e970ef 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5378,3 +5378,201 @@ error0: } return error; } + +/* + * Shift extent records to the left to cover a hole. + * + * The maximum number of extents to be shifted in a single operation + * is @num_exts, and @current_ext keeps track of the current extent + * index we have shifted. @offset_shift_fsb is the length by which each + * extent is shifted. If there is no hole to shift the extents + * into, this will be considered invalid operation and we abort immediately. + */ +int +xfs_bmap_shift_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + int *done, + xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, + xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int num_exts) +{ + struct xfs_btree_cur *cur; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_extnum_t nexts = 0; + xfs_fileoff_t startoff; + int error = 0; + int i; + int whichfork = XFS_DATA_FORK; + int logflags; + xfs_filblks_t blockcount = 0; + int total_extents; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_shift_extents", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + ASSERT(current_ext != NULL); + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * If *current_ext is 0, we would need to lookup the extent + * from where we would start shifting and store it in gotp. + */ + if (!*current_ext) { + gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext); + /* + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) start_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + if (!gotp) { + *done = 1; + return 0; + } + } + + /* We are going to change core inode */ + logflags = XFS_ILOG_CORE; + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else { + cur = NULL; + logflags |= XFS_ILOG_DEXT; + } + + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { + + gotp = xfs_iext_get_ext(ifp, *current_ext); + xfs_bmbt_get_all(gotp, &got); + startoff = got.br_startoff - offset_shift_fsb; + + /* + * Before shifting extent into hole, make sure that the hole + * is large enough to accomodate the shift. + */ + if (*current_ext) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + *current_ext - 1), &left); + + if (startoff < left.br_startoff + left.br_blockcount) + error = XFS_ERROR(EINVAL); + } else if (offset_shift_fsb > got.br_startoff) { + /* + * When first extent is shifted, offset_shift_fsb + * should be less than the stating offset of + * the first extent. + */ + error = XFS_ERROR(EINVAL); + } + + if (error) + goto del_cursor; + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + + /* Check if we can merge 2 adjacent extents */ + if (*current_ext && + left.br_startoff + left.br_blockcount == startoff && + left.br_startblock + left.br_blockcount == + got.br_startblock && + left.br_state == got.br_state && + left.br_blockcount + got.br_blockcount <= MAXEXTLEN) { + blockcount = left.br_blockcount + + got.br_blockcount; + xfs_iext_remove(ip, *current_ext, 1, 0); + if (cur) { + error = xfs_btree_delete(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + gotp = xfs_iext_get_ext(ifp, --*current_ext); + xfs_bmbt_get_all(gotp, &got); + + /* Make cursor point to the extent we will update */ + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, blockcount); + got.br_blockcount = blockcount; + } else { + /* We have to update the startoff */ + xfs_bmbt_set_startoff(gotp, startoff); + got.br_startoff = startoff; + } + + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } + + (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + } + + /* Check if we are done */ + if (*current_ext == total_extents) + *done = 1; + +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + xfs_trans_log_inode(tp, ip, logflags); + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 33b41f351225..f84bd7af43be 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -127,6 +127,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" } + +/* + * This macro is used to determine how many extents will be shifted + * in one write transaction. We could require two splits, + * an extent move on the first and an extent merge on the second, + * So it is proper that one extent is shifted inside write transaction + * at a time. + */ +#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 + #ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); @@ -169,5 +179,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); +int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int *done, xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + int num_exts); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 706bc3f777cb..818d546664e7 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -780,12 +780,14 @@ static void xfs_bmbt_read_verify( struct xfs_buf *bp) { - if (!(xfs_btree_lblock_verify_crc(bp) && - xfs_bmbt_verify(bp))) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); + if (!xfs_btree_lblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_bmbt_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } } @@ -794,11 +796,9 @@ xfs_bmbt_write_verify( struct xfs_buf *bp) { if (!xfs_bmbt_verify(bp)) { - xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn); trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } xfs_btree_lblock_calc_crc(bp); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f264616080ca..296160b8e78c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1349,7 +1349,6 @@ xfs_free_file_space( * the freeing of the space succeeds at ENOSPC. */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); /* @@ -1419,6 +1418,8 @@ xfs_zero_file_space( xfs_off_t end_boundary; int error; + trace_xfs_zero_file_space(ip); + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); /* @@ -1433,9 +1434,18 @@ xfs_zero_file_space( ASSERT(end_boundary <= offset + len); if (start_boundary < end_boundary - 1) { - /* punch out the page cache over the conversion range */ + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); + /* convert the blocks */ error = xfs_alloc_file_space(ip, start_boundary, end_boundary - start_boundary - 1, @@ -1468,6 +1478,102 @@ out: } /* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int done = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + xfs_extnum_t current_ext = 0; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + int committed; + xfs_fileoff_t start_fsb; + xfs_fileoff_t shift_fsb; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + trace_xfs_collapse_file_space(ip); + + start_fsb = XFS_B_TO_FSB(mp, offset + len); + shift_fsb = XFS_B_TO_FSB(mp, len); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + while (!error && !done) { + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + tp->t_flags |= XFS_TRANS_RESERVE; + /* + * We would need to reserve permanent block for transaction. + * This will come into picture when after shifting extent into + * hole we found that adjacent extents can be merged which + * may lead to freeing of a block during record update. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + + /* + * We are using the write transaction in which max 2 bmbt + * updates are allowed + */ + error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, + shift_fsb, ¤t_ext, + &first_block, &free_list, + XFS_BMAP_MAX_SHIFT_EXTENTS); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + return error; + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* * We need to check that the format of the data fork in the temporary inode is * valid for the target inode before doing the swap. This is not a problem with * attr1 because of the fixed fork offset, but attr2 has a dynamically sized diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 900747b25772..935ed2b24edf 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -99,6 +99,8 @@ int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); +int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 9adaae4f3e2f..e80d59fdf89a 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -234,8 +234,7 @@ xfs_btree_lblock_calc_crc( return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_LBLOCK_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); } bool @@ -243,8 +242,8 @@ xfs_btree_lblock_verify_crc( struct xfs_buf *bp) { if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) - return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_LBLOCK_CRC_OFF); + return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + return true; } @@ -267,8 +266,7 @@ xfs_btree_sblock_calc_crc( return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_SBLOCK_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } bool @@ -276,8 +274,8 @@ xfs_btree_sblock_verify_crc( struct xfs_buf *bp) { if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) - return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_BTREE_SBLOCK_CRC_OFF); + return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + return true; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9c061ef2b0d9..cb10a0aaab3a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -396,7 +396,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; + unsigned noio_flag; + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -404,6 +414,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); if (!bp->b_addr) return -ENOMEM; @@ -1361,21 +1372,29 @@ xfs_buf_iorequest( xfs_buf_wait_unpin(bp); xfs_buf_hold(bp); - /* Set the count to 1 initially, this will stop an I/O + /* + * Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started * all the I/O from calling xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 1); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); xfs_buf_rele(bp); } /* * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer. It - * returns the I/O error code, if any, or 0 if there was no error. + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. */ int xfs_buf_iowait( diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 995339534db6..b8a3abf6cf47 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -369,6 +369,20 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) xfs_buf_rele(bp); } +static inline int +xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +static inline void +xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + /* * Handling of buftargs. */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 33149113e333..8752821443be 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -796,20 +796,6 @@ xfs_buf_item_init( bip->bli_formats[i].blf_map_size = map_size; } -#ifdef XFS_TRANS_DEBUG - /* - * Allocate the arrays for tracking what needs to be logged - * and what our callers request to be logged. bli_orig - * holds a copy of the original, clean buffer for comparison - * against, and bli_logged keeps a 1 bit flag per byte in - * the buffer to indicate which bytes the callers have asked - * to have logged. - */ - bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP); - memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length)); - bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP); -#endif - /* * Put the buf item into the list of items attached to the * buffer at the front. @@ -957,11 +943,6 @@ STATIC void xfs_buf_item_free( xfs_buf_log_item_t *bip) { -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - kmem_free(bip->bli_logged); -#endif /* XFS_TRANS_DEBUG */ - xfs_buf_item_free_format(bip); kmem_zone_free(xfs_buf_item_zone, bip); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 796272a2e129..6cc5f6785a77 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -185,8 +185,8 @@ xfs_da3_node_write_verify( struct xfs_da3_node_hdr *hdr3 = bp->b_addr; if (!xfs_da3_node_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -196,7 +196,7 @@ xfs_da3_node_write_verify( if (bip) hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF); } /* @@ -209,18 +209,20 @@ static void xfs_da3_node_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_da_blkinfo *info = bp->b_addr; switch (be16_to_cpu(info->magic)) { case XFS_DA3_NODE_MAGIC: - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DA3_NODE_CRC_OFF)) + if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { + xfs_buf_ioerror(bp, EFSBADCRC); break; + } /* fall through */ case XFS_DA_NODE_MAGIC: - if (!xfs_da3_node_verify(bp)) + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); break; + } return; case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: @@ -237,8 +239,7 @@ xfs_da3_node_read_verify( } /* corrupt block */ - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); } const struct xfs_buf_ops xfs_da3_node_buf_ops = { @@ -1295,7 +1296,7 @@ xfs_da3_fixhashpath( node = blk->bp->b_addr; dp->d_ops->node_hdr_from_disk(&nodehdr, node); btree = dp->d_ops->node_tree_p(node); - if (be32_to_cpu(btree->hashval) == lasthash) + if (be32_to_cpu(btree[blk->index].hashval) == lasthash) break; blk->hashval = lasthash; btree[blk->index].hashval = cpu_to_be32(lasthash); diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6e95ea79f5d7..201c6091d26a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -60,10 +60,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index e5869b50dc41..623bbe8fd921 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -89,6 +89,8 @@ typedef struct xfs_dinode { /* structure must be padded to 64 bit alignment */ } xfs_dinode_t; +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) + #define DI_MAX_FLUSH 0xffff /* diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index ce16ef02997a..fda46253966a 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -180,16 +180,23 @@ xfs_dir_init( xfs_inode_t *dp, xfs_inode_t *pdp) { - xfs_da_args_t args; + struct xfs_da_args *args; int error; - memset((char *)&args, 0, sizeof(args)); - args.dp = dp; - args.trans = tp; ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) + error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); + if (error) return error; - return xfs_dir2_sf_create(&args, pdp->i_ino); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->dp = dp; + args->trans = tp; + error = xfs_dir2_sf_create(args, pdp->i_ino); + kmem_free(args); + return error; } /* @@ -205,41 +212,56 @@ xfs_dir_createname( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; XFS_STATS_INC(xs_dir_create); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); + +out_free: + kmem_free(args); return rval; } @@ -282,46 +304,66 @@ xfs_dir_lookup( xfs_ino_t *inum, /* out: inode number */ struct xfs_name *ci_name) /* out: actual name if CI match */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_OKNOENT; + /* + * We need to use KM_NOFS here so that lockdep will not throw false + * positive deadlock warnings on a non-transactional lookup path. It is + * safe to recurse into inode recalim in that case, but lockdep can't + * easily be taught about it. Hence KM_NOFS avoids having to add more + * lockdep Doing this avoids having to add a bunch of lockdep class + * annotations into the reclaim path for the ilock. + */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_OKNOENT; if (ci_name) - args.op_flags |= XFS_DA_OP_CILOOKUP; + args->op_flags |= XFS_DA_OP_CILOOKUP; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_lookup(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_lookup(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_lookup(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_lookup(args); else - rval = xfs_dir2_node_lookup(&args); + rval = xfs_dir2_node_lookup(args); + +out_check_rval: if (rval == EEXIST) rval = 0; if (!rval) { - *inum = args.inumber; + *inum = args->inumber; if (ci_name) { - ci_name->name = args.value; - ci_name->len = args.valuelen; + ci_name->name = args->value; + ci_name->len = args->valuelen; } } +out_free: + kmem_free(args); return rval; } @@ -338,38 +380,51 @@ xfs_dir_removename( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = ino; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_removename(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_removename(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_removename(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = ino; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_removename(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_removename(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_removename(args); else - rval = xfs_dir2_node_removename(&args); + rval = xfs_dir2_node_removename(args); +out_free: + kmem_free(args); return rval; } @@ -386,40 +441,54 @@ xfs_dir_replace( xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_replace(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_replace(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_replace(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_replace(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_replace(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_replace(args); else - rval = xfs_dir2_node_replace(&args); + rval = xfs_dir2_node_replace(args); +out_free: + kmem_free(args); return rval; } @@ -434,7 +503,7 @@ xfs_dir_canenter( struct xfs_name *name, /* name of entry to add */ uint resblks) { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ @@ -443,29 +512,42 @@ xfs_dir_canenter( ASSERT(S_ISDIR(dp->i_d.di_mode)); - memset(&args, 0, sizeof(xfs_da_args_t)); - args.name = name->name; - args.namelen = name->len; - args.filetype = name->type; - args.hashval = dp->i_mount->m_dirnameops->hashname(name); - args.dp = dp; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(tp, dp, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(tp, dp, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); +out_free: + kmem_free(args); return rval; } diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 90cdbf4b5f19..4f6a38cb83a4 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -89,13 +89,14 @@ xfs_dir3_block_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_DATA_CRC_OFF)) || - !xfs_dir3_block_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_block_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -107,8 +108,8 @@ xfs_dir3_block_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_block_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -118,7 +119,7 @@ xfs_dir3_block_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 70acff4ee173..afa4ad523f3f 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -241,7 +241,6 @@ static void xfs_dir3_data_reada_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_data_hdr *hdr = bp->b_addr; switch (hdr->magic) { @@ -255,8 +254,8 @@ xfs_dir3_data_reada_verify( xfs_dir3_data_verify(bp); return; default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); break; } } @@ -267,13 +266,14 @@ xfs_dir3_data_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_DATA_CRC_OFF)) || - !xfs_dir3_data_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_data_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -285,8 +285,8 @@ xfs_dir3_data_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_data_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -296,7 +296,7 @@ xfs_dir3_data_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_data_buf_ops = { diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ae47ec6e16c4..d36e97df1187 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -179,13 +179,14 @@ __read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_LEAF_CRC_OFF)) || - !xfs_dir3_leaf_verify(bp, magic)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_leaf_verify(bp, magic)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -198,8 +199,8 @@ __write_verify( struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_leaf_verify(bp, magic)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -209,7 +210,7 @@ __write_verify( if (bip) hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } static void diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 48c7d18f68c3..cb434d732681 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -115,13 +115,14 @@ xfs_dir3_free_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if ((xfs_sb_version_hascrc(&mp->m_sb) && - !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_DIR3_FREE_CRC_OFF)) || - !xfs_dir3_free_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_free_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -133,8 +134,8 @@ xfs_dir3_free_write_verify( struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!xfs_dir3_free_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -144,7 +145,7 @@ xfs_dir3_free_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); + xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF); } const struct xfs_buf_ops xfs_dir3_free_buf_ops = { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7aeb4c895b32..868b19f096bf 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -615,7 +615,7 @@ xfs_qm_dqread( if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm, + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c index d401457d2f25..610da8177737 100644 --- a/fs/xfs/xfs_dquot_buf.c +++ b/fs/xfs/xfs_dquot_buf.c @@ -257,10 +257,13 @@ xfs_dquot_buf_read_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (!xfs_dquot_buf_verify_crc(mp, bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dquot_buf_verify(mp, bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } /* @@ -275,8 +278,8 @@ xfs_dquot_buf_write_verify( struct xfs_mount *mp = bp->b_target->bt_mount; if (!xfs_dquot_buf_verify(mp, bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } } diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 9995b807d627..edac5b057d28 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -156,7 +156,7 @@ xfs_error_report( { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, - "Internal error %s at line %d of file %s. Caller 0x%p", + "Internal error %s at line %d of file %s. Caller %pF", tag, linenum, filename, ra); xfs_stack_trace(); @@ -178,3 +178,28 @@ xfs_corruption_error( xfs_error_report(tag, level, mp, filename, linenum, ra); xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } + +/* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ +void +xfs_verifier_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + bp->b_error == EFSBADCRC ? "CRC error" : "corruption", + __return_address, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); + xfs_hex_dump(xfs_buf_offset(bp, 0), 64); + } + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 079a367f44ee..c1c57d4a4b5d 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, int linenum, inst_t *ra); +extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1399e187d425..753e467aa1a5 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 64b48eade91d..830c1c937b88 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -155,7 +155,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -295,7 +295,7 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( + ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, -1); if (ret) { @@ -679,7 +679,7 @@ xfs_file_dio_aio_write( goto out; if (mapping->nrpages) { - ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, pos, -1); if (ret) goto out; @@ -699,7 +699,7 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + &nr_segs, pos, count, ocount); out: xfs_rw_iunlock(ip, iolock); @@ -715,7 +715,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t count) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -724,7 +724,7 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - size_t count = ocount; + struct iov_iter from; xfs_rw_ilock(ip, iolock); @@ -732,14 +732,15 @@ xfs_file_buffered_aio_write( if (ret) goto out; + iov_iter_init(&from, iovp, nr_segs, count, 0); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, 0); - + ret = generic_perform_write(file, &from, pos); + if (likely(ret >= 0)) + iocb->ki_pos = pos + ret; /* * If we just got an ENOSPC, try to write back all dirty inodes to * convert delalloc space to free up some of the excess reserved @@ -823,7 +824,8 @@ xfs_file_fallocate( if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; xfs_ilock(ip, XFS_IOLOCK_EXCL); @@ -831,6 +833,28 @@ xfs_file_fallocate( error = xfs_free_file_space(ip, offset, len); if (error) goto out_unlock; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + if (offset & blksize_mask || len & blksize_mask) { + error = EINVAL; + goto out_unlock; + } + + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = EINVAL; + goto out_unlock; + } + + new_size = i_size_read(inode) - len; + + error = xfs_collapse_file_space(ip, offset, len); + if (error) + goto out_unlock; } else { if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { @@ -840,8 +864,11 @@ xfs_file_fallocate( goto out_unlock; } - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); + if (mode & FALLOC_FL_ZERO_RANGE) + error = xfs_zero_file_space(ip, offset, len); + else + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); if (error) goto out_unlock; } @@ -859,7 +886,7 @@ xfs_file_fallocate( if (ip->i_d.di_mode & S_IXGRP) ip->i_d.di_mode &= ~S_ISGID; - if (!(mode & FALLOC_FL_PUNCH_HOLE)) + if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))) ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -1465,6 +1492,7 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h index b6ab5a3cfa12..9898f31d05d8 100644 --- a/fs/xfs/xfs_format.h +++ b/fs/xfs/xfs_format.h @@ -145,6 +145,8 @@ struct xfs_dsymlink_hdr { __be64 sl_lsn; }; +#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) + /* * The maximum pathlen is 1024 bytes. Since the minimum file system * blocksize is 512 bytes, we can get a max of 3 extents back from diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5d7f105a1c82..8f711db61a0c 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -363,6 +363,18 @@ xfs_ialloc_ag_alloc( args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; + + /* + * This request might have dirtied the transaction if the AG can + * satisfy the request, but the exact block was not available. + * If the allocation did fail, subsequent requests will relax + * the exact agbno requirement and increase the alignment + * instead. It is critical that the total size of the request + * (len + alignment + slop) does not increase from this point + * on, so reset minalignslop to ensure it is not included in + * subsequent requests. + */ + args.minalignslop = 0; } else args.fsbno = NULLFSBLOCK; @@ -1568,18 +1580,17 @@ xfs_agi_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - int agi_ok = 1; - - if (xfs_sb_version_hascrc(&mp->m_sb)) - agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agi, agi_crc)); - agi_ok = agi_ok && xfs_agi_verify(bp); - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, + XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -1590,8 +1601,8 @@ xfs_agi_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; if (!xfs_agi_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -1600,8 +1611,7 @@ xfs_agi_write_verify( if (bip) XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_agi, agi_crc)); + xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } const struct xfs_buf_ops xfs_agi_buf_ops = { diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c8fa5bbb36de..7e309b11e87d 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -243,12 +243,14 @@ static void xfs_inobt_read_verify( struct xfs_buf *bp) { - if (!(xfs_btree_sblock_verify_crc(bp) && - xfs_inobt_verify(bp))) { - trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_inobt_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } } @@ -258,9 +260,9 @@ xfs_inobt_write_verify( { if (!xfs_inobt_verify(bp)) { trace_xfs_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - bp->b_target->bt_mount, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_btree_sblock_calc_crc(bp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3a137e9f9a7d..768087bedbac 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -42,7 +42,6 @@ #include "xfs_bmap_util.h" #include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_cksum.h" #include "xfs_trace.h" @@ -62,6 +61,8 @@ kmem_zone_t *xfs_inode_zone; STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); +STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); + /* * helper function to extract extent size hint from inode */ @@ -1115,7 +1116,7 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_nlink > 0); + ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); ip->i_d.di_nlink++; inc_nlink(VFS_I(ip)); if ((ip->i_d.di_version == 1) && @@ -1165,10 +1166,7 @@ xfs_create( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; + prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. @@ -1333,6 +1331,114 @@ xfs_create( } int +xfs_create_tmpfile( + struct xfs_inode *dp, + struct dentry *dentry, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + resblks = XFS_IALLOC_SPACE_RES(mp); + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); + + tres = &M_RES(mp)->tr_create_tmpfile; + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + ip->i_d.di_nlink--; + error = xfs_iunlink(tp, ip); + if (error) + goto out_trans_abort; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +int xfs_link( xfs_inode_t *tdp, xfs_inode_t *sip, @@ -1397,6 +1503,12 @@ xfs_link( xfs_bmap_init(&free_list, &first_block); + if (sip->i_d.di_nlink == 0) { + error = xfs_iunlink_remove(tp, sip); + if (error) + goto abort_return; + } + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); if (error) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 65e2350f449c..f2fcde52b66d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -20,6 +20,7 @@ #include "xfs_inode_buf.h" #include "xfs_inode_fork.h" +#include "xfs_dinode.h" /* * Kernel only inode definitions @@ -192,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip, ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); } +static inline prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + return xfs_get_projid(dp); + + return XFS_PROJID_DEFAULT; +} + /* * In-core inode flags. */ @@ -323,6 +333,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, + umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c index 4fc9f39dd89e..24e993996bdc 100644 --- a/fs/xfs/xfs_inode_buf.c +++ b/fs/xfs/xfs_inode_buf.c @@ -102,8 +102,7 @@ xfs_inode_buf_verify( } xfs_buf_ioerror(bp, EFSCORRUPTED); - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, - mp, dip); + xfs_verifier_error(bp); #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", @@ -306,7 +305,7 @@ xfs_dinode_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, - offsetof(struct xfs_dinode, di_crc))) + XFS_DINODE_CRC_OFF)) return false; if (be64_to_cpu(dip->di_ino) != ip->i_ino) return false; @@ -327,7 +326,7 @@ xfs_dinode_calc_crc( ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, - offsetof(struct xfs_dinode, di_crc)); + XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcfe61202115..0b18776b075e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -271,32 +271,6 @@ xfs_open_by_handle( return error; } -/* - * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's - * unused first argument. - */ -STATIC int -do_readlink( - char __user *buffer, - int buflen, - const char *link) -{ - int len; - - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; - - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; - out: - return len; -} - - int xfs_readlink_by_handle( struct file *parfilp, @@ -334,7 +308,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; - error = do_readlink(hreq->ohandle, olen, link); + error = readlink_copy(hreq->ohandle, olen, link); if (error) goto out_kfree; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 22d1cbea283d..3b80ebae05f5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -128,7 +128,6 @@ xfs_iomap_write_direct( xfs_fsblock_t firstfsb; xfs_extlen_t extsz, temp; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; @@ -200,18 +199,15 @@ xfs_iomap_write_direct( xfs_trans_ijoin(tp, ip, 0); - bmapi_flag = 0; - if (offset < XFS_ISIZE(ip) || extsz) - bmapi_flag |= XFS_BMAPI_PREALLOC; - /* * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, imap, &nimaps, &free_list); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_PREALLOC, &firstfsb, 0, + imap, &nimaps, &free_list); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 9ddfb8190ca1..36d630319a27 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -39,6 +39,7 @@ #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" #include "xfs_dinode.h" +#include "xfs_trans_space.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -48,6 +49,18 @@ #include <linux/fiemap.h> #include <linux/slab.h> +/* + * Directories have different lock order w.r.t. mmap_sem compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_sem is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. + */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; + static int xfs_initxattrs( struct inode *inode, @@ -59,8 +72,8 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); if (error < 0) break; } @@ -80,8 +93,8 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void @@ -111,15 +124,15 @@ xfs_cleanup_inode( xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -143,8 +156,12 @@ xfs_vn_mknod( if (error) return error; - xfs_dentry_to_name(&name, dentry, mode); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } if (unlikely(error)) goto out_free_acl; @@ -156,18 +173,22 @@ xfs_vn_mknod( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } #endif - d_instantiate(dentry, inode); + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -176,11 +197,23 @@ xfs_vn_mknod( return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); goto out_free_acl; } STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + +STATIC int xfs_vn_create( struct inode *dir, struct dentry *dentry, @@ -340,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -1034,6 +1068,15 @@ xfs_vn_fiemap( return 0; } +STATIC int +xfs_vn_tmpfile( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_generic_create(dir, dentry, mode, 0, true); +} + static const struct inode_operations xfs_inode_operations = { .get_acl = xfs_get_acl, .set_acl = xfs_set_acl, @@ -1072,6 +1115,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, }; static const struct inode_operations xfs_dir_ci_inode_operations = { @@ -1099,6 +1143,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, }; static const struct inode_operations xfs_symlink_inode_operations = { @@ -1191,6 +1236,7 @@ xfs_setup_inode( xfs_diflags_to_iflags(inode, ip); ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &xfs_inode_operations; @@ -1198,6 +1244,7 @@ xfs_setup_inode( inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) inode->i_op = &xfs_dir_ci_inode_operations; else diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index f9bb590acc0e..825249d2dfc1 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -119,6 +119,7 @@ typedef __uint64_t __psunsigned_t; #include "xfs_iops.h" #include "xfs_aops.h" #include "xfs_super.h" +#include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" @@ -178,6 +179,7 @@ typedef __uint64_t __psunsigned_t; #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 8497a00e399d..a5f8bd9899d3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -616,11 +616,13 @@ xfs_log_mount( int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } @@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp) /* log I/O is always issued ASYNC */ ASSERT(XFS_BUF_ISASYNC(bp)); xlog_state_done_syncing(iclog, aborted); + /* - * do not reference the buffer (bp) here as we could race - * with it being freed after writing the unmount record to the - * log. + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. */ + xfs_buf_unlock(bp); } /* @@ -1368,8 +1373,16 @@ xlog_alloc_log( bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); if (!bp) goto out_free_log; - bp->b_iodone = xlog_iodone; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; log->l_xbuf = bp; spin_lock_init(&log->l_icloglock); @@ -1398,6 +1411,9 @@ xlog_alloc_log( if (!bp) goto out_free_iclog; + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -1422,7 +1438,6 @@ xlog_alloc_log( iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(xfs_buf_islocked(iclog->ic_bp)); init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1631,6 +1646,12 @@ xlog_cksum( * we transition the iclogs to IOERROR state *after* flushing all existing * iclogs to disk. This is because we don't want anymore new transactions to be * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. */ STATIC int xlog_bdstrat( @@ -1638,6 +1659,7 @@ xlog_bdstrat( { struct xlog_in_core *iclog = bp->b_fspriv; + xfs_buf_lock(bp); if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, EIO); xfs_buf_stale(bp); @@ -1645,7 +1667,8 @@ xlog_bdstrat( /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of - * doing it here. + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. */ return 0; } @@ -1847,14 +1870,28 @@ xlog_dealloc_log( xlog_cil_destroy(log); /* - * always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); xfs_buf_free(log->l_xbuf); iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; kmem_free(iclog); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index b0f4ef77fa70..2c4004475e71 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -175,7 +175,7 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, +void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, int flags); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4ef6fdbced78..7e5455391176 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -499,13 +499,6 @@ xlog_cil_push( cil->xc_ctx = new_ctx; /* - * mirror the new sequence into the cil structure so that we can do - * unlocked checks against the current sequence in log forces without - * risking deferencing a freed context pointer. - */ - cil->xc_current_sequence = new_ctx->sequence; - - /* * The switch is now done, so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so @@ -523,8 +516,15 @@ xlog_cil_push( * Hence we need to add this context to the committing context list so * that higher sequences will wait for us to write out a commit record * before they do. + * + * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking + * deferencing a freed context pointer. */ spin_lock(&cil->xc_push_lock); + cil->xc_current_sequence = new_ctx->sequence; list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); @@ -662,8 +662,14 @@ xlog_cil_push_background( } +/* + * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence + * number that is passed. When it returns, the work will be queued for + * @push_seq, but it won't be completed. The caller is expected to do any + * waiting for push_seq to complete if it is required. + */ static void -xlog_cil_push_foreground( +xlog_cil_push_now( struct xlog *log, xfs_lsn_t push_seq) { @@ -688,10 +694,8 @@ xlog_cil_push_foreground( } cil->xc_push_seq = push_seq; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); spin_unlock(&cil->xc_push_lock); - - /* do the push now */ - xlog_cil_push(log); } bool @@ -721,7 +725,7 @@ xlog_cil_empty( * background commit, returns without it held once background commits are * allowed again. */ -int +void xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, @@ -767,7 +771,6 @@ xfs_log_commit_cil( xlog_cil_push_background(log); up_read(&cil->xc_ctx_lock); - return 0; } /* @@ -796,7 +799,8 @@ xlog_cil_force_lsn( * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ - xlog_cil_push_foreground(log, sequence); +restart: + xlog_cil_push_now(log, sequence); /* * See if we can find a previous sequence still committing. @@ -804,7 +808,6 @@ xlog_cil_force_lsn( * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ -restart: spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { if (ctx->sequence > sequence) @@ -822,6 +825,28 @@ restart: /* found it! */ commit_lsn = ctx->commit_lsn; } + + /* + * The call to xlog_cil_push_now() executes the push in the background. + * Hence by the time we have got here it our sequence may not have been + * pushed yet. This is true if the current sequence still matches the + * push sequence after the above wait loop and the CIL still contains + * dirty objects. + * + * When the push occurs, it will empty the CIL and + * atomically increment the currect sequence past the push sequence and + * move it into the committing list. Of course, if the CIL is clean at + * the time of the push, it won't have pushed the CIL at all, so in that + * case we should try the push for this sequence again from the start + * just in case. + */ + + if (sequence == cil->xc_current_sequence && + !list_empty(&cil->xc_cil)) { + spin_unlock(&cil->xc_push_lock); + goto restart; + } + spin_unlock(&cil->xc_push_lock); return commit_lsn; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f96c05669a9e..944f3d9456a8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -314,6 +314,9 @@ reread: error = bp->b_error; if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; goto release_buf; } @@ -740,8 +743,6 @@ xfs_mountfs( new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; - xfs_info(mp, "Using inode cluster size of %d bytes", - mp->m_inode_cluster_size); } /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 348e4d2ed6e6..dc977b6e6a36 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -843,22 +843,17 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - if ((error = list_lru_init(&qinf->qi_lru))) { - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - if ((error = xfs_qm_init_quotainos(mp))) { - list_lru_destroy(&qinf->qi_lru); - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); @@ -918,7 +913,7 @@ xfs_qm_init_quotainfo( qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - + xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -935,6 +930,13 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a6a76b2b6a85..ec5ca65c6211 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -842,7 +842,7 @@ xfs_growfs_rt_alloc( /* * Reserve space & log for one extent added to the file. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, resblks, 0); if (error) goto error_cancel; diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index 1e116794bb66..8baf61afae1d 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -201,10 +201,6 @@ xfs_mount_validate_sb( * write validation, we don't need to check feature masks. */ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, @@ -288,6 +284,7 @@ xfs_mount_validate_sb( sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || @@ -610,12 +607,11 @@ xfs_sb_read_verify( XFS_SB_VERSION_5) || dsb->sb_crc != 0)) { - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_sb, sb_crc))) { + if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ if (bp->b_bn == XFS_SB_DADDR || xfs_sb_version_hascrc(&mp->m_sb)) { - error = EFSCORRUPTED; + error = EFSBADCRC; goto out_error; } } @@ -624,10 +620,9 @@ xfs_sb_read_verify( out_error: if (error) { - if (error == EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, bp->b_addr); xfs_buf_ioerror(bp, error); + if (error == EFSCORRUPTED || error == EFSBADCRC) + xfs_verifier_error(bp); } } @@ -662,9 +657,8 @@ xfs_sb_write_verify( error = xfs_sb_verify(bp, false); if (error) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, - mp, bp->b_addr); xfs_buf_ioerror(bp, error); + xfs_verifier_error(bp); return; } @@ -674,8 +668,7 @@ xfs_sb_write_verify( if (bip) XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_sb, sb_crc)); + xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); } const struct xfs_buf_ops xfs_sb_buf_ops = { diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 35061d4b614c..f7b2fe77c5a5 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -182,6 +182,8 @@ typedef struct xfs_sb { /* must be padded to 64 bit alignment */ } xfs_sb_t; +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + /* * Superblock - on disk version. Must match the in core version above. * Must be padded to 64 bit alignment. diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h index 8c5035a13df1..4484e5151395 100644 --- a/fs/xfs/xfs_shared.h +++ b/fs/xfs/xfs_shared.h @@ -104,7 +104,8 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; #define XFS_TRANS_SB_COUNT 41 #define XFS_TRANS_CHECKPOINT 42 #define XFS_TRANS_ICREATE 43 -#define XFS_TRANS_TYPE_MAX 43 +#define XFS_TRANS_CREATE_TMPFILE 44 +#define XFS_TRANS_TYPE_MAX 44 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -112,6 +113,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ { XFS_TRANS_INACTIVE, "INACTIVE" }, \ { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ { XFS_TRANS_REMOVE, "REMOVE" }, \ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d971f4932b5d..3494eff8e4eb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -996,7 +996,7 @@ xfs_fs_evict_inode( trace_xfs_evict_inode(ip); - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); @@ -1197,6 +1197,7 @@ xfs_fs_remount( char *p; int error; + sync_filesystem(sb); while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1432,11 +1433,11 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_init_mount_workqueues(mp); + error = -xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = -xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 14e58f2c96bd..52979aa90986 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -80,6 +80,10 @@ xfs_readlink_bmap( if (error) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; goto out; } byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); @@ -208,10 +212,7 @@ xfs_symlink( return XFS_ERROR(ENAMETOOLONG); udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = xfs_get_projid(dp); - else - prid = XFS_PROJID_DEFAULT; + prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c index bf59a2b45f8c..9b32052ff65e 100644 --- a/fs/xfs/xfs_symlink_remote.c +++ b/fs/xfs/xfs_symlink_remote.c @@ -133,12 +133,13 @@ xfs_symlink_read_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_dsymlink_hdr, sl_crc)) || - !xfs_symlink_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); + if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_symlink_verify(bp)) xfs_buf_ioerror(bp, EFSCORRUPTED); - } + + if (bp->b_error) + xfs_verifier_error(bp); } static void @@ -153,8 +154,8 @@ xfs_symlink_write_verify( return; if (!xfs_symlink_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); return; } @@ -162,8 +163,7 @@ xfs_symlink_write_verify( struct xfs_dsymlink_hdr *dsl = bp->b_addr; dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - offsetof(struct xfs_dsymlink_hdr, sl_crc)); + xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF); } const struct xfs_buf_ops xfs_symlink_buf_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 425dfa45b9a0..65d8c793a25c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -603,6 +603,8 @@ DEFINE_INODE_EVENT(xfs_readlink); DEFINE_INODE_EVENT(xfs_inactive_symlink); DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); +DEFINE_INODE_EVENT(xfs_collapse_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c812c5c060de..54a57326d85b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -887,12 +887,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); - if (error == ENOMEM) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - error = XFS_ERROR(EIO); - goto out_unreserve; - } + xfs_log_commit_cil(mp, tp, &commit_lsn, flags); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -902,10 +897,7 @@ xfs_trans_commit( * log out now and wait for it. */ if (sync) { - if (!error) { - error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, NULL); - } + error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); XFS_STATS_INC(xs_trans_sync); } else { XFS_STATS_INC(xs_trans_async); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 647b6f1d8923..b8eef0549f3f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -275,6 +275,10 @@ xfs_trans_read_buf_map( XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG @@ -338,6 +342,9 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } } @@ -375,6 +382,10 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index 2ffd3e331b49..ae368165244d 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -81,20 +81,28 @@ xfs_calc_buf_res( * on disk. Hence we need an inode reservation function that calculates all this * correctly. So, we log: * - * - log op headers for object + * - 4 log op headers for object + * - for the ilf, the inode core and 2 forks * - inode log format object - * - the entire inode contents (core + 2 forks) - * - two bmap btree block headers + * - the inode core + * - two inode forks containing bmap btree root blocks. + * - the btree data contained by both forks will fit into the inode size, + * hence when combined with the inode core above, we have a total of the + * actual inode size. + * - the BMBT headers need to be accounted separately, as they are + * additional to the records and pointers that fit inside the inode + * forks. */ STATIC uint xfs_calc_inode_res( struct xfs_mount *mp, uint ninodes) { - return ninodes * (sizeof(struct xlog_op_header) + - sizeof(struct xfs_inode_log_format) + - mp->m_sb.sb_inodesize + - 2 * XFS_BMBT_BLOCK_LEN(mp)); + return ninodes * + (4 * sizeof(struct xlog_op_header) + + sizeof(struct xfs_inode_log_format) + + mp->m_sb.sb_inodesize + + 2 * XFS_BMBT_BLOCK_LEN(mp)); } /* @@ -204,6 +212,19 @@ xfs_calc_rename_reservation( } /* + * For removing an inode from unlinked list at first, we can modify: + * the agi hash list and counters: sector size + * the on disk inode before ours in the agi hash list: inode cluster size + */ +STATIC uint +xfs_calc_iunlink_remove_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); +} + +/* * For creating a link to an inode: * the parent directory inode: inode size * the linked inode: inode size @@ -220,6 +241,7 @@ xfs_calc_link_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_remove_reservation(mp) + MAX((xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), @@ -229,6 +251,18 @@ xfs_calc_link_reservation( } /* + * For adding an inode to unlinked list we can modify: + * the agi hash list: sector size + * the unlinked inode: inode size + */ +STATIC uint +xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 1); +} + +/* * For removing a directory entry we can modify: * the parent directory inode: inode size * the removed inode: inode size @@ -245,10 +279,11 @@ xfs_calc_remove_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + - MAX((xfs_calc_inode_res(mp, 2) + + xfs_calc_iunlink_add_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), XFS_FSB_TO_B(mp, 1)))); } @@ -343,6 +378,20 @@ xfs_calc_create_reservation( } +STATIC uint +xfs_calc_create_tmpfile_reservation( + struct xfs_mount *mp) +{ + uint res = XFS_DQUOT_LOGRES(mp); + + if (xfs_sb_version_hascrc(&mp->m_sb)) + res += xfs_calc_icreate_resv_alloc(mp); + else + res += xfs_calc_create_resv_alloc(mp); + + return res + xfs_calc_iunlink_add_reservation(mp); +} + /* * Making a new directory is the same as creating a new file. */ @@ -383,9 +432,9 @@ xfs_calc_ifree_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) + + xfs_calc_iunlink_remove_reservation(mp) + xfs_calc_buf_res(1, 0) + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + @@ -644,15 +693,14 @@ xfs_calc_qm_setqlim_reservation( /* * Allocating quota on disk if needed. - * the write transaction log space: M_RES(mp)->tr_write.tr_logres + * the write transaction log space for quota file extent allocation * the unit of quota allocation: one system block size */ STATIC uint xfs_calc_qm_dqalloc_reservation( struct xfs_mount *mp) { - ASSERT(M_RES(mp)->tr_write.tr_logres); - return M_RES(mp)->tr_write.tr_logres + + return xfs_calc_write_reservation(mp) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); } @@ -729,6 +777,11 @@ xfs_trans_resv_calc( resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_create_tmpfile.tr_logres = + xfs_calc_create_tmpfile_reservation(mp); + resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; + resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; @@ -784,7 +837,6 @@ xfs_trans_resv_calc( /* The following transaction are logged in logical format */ resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); - resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp); resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h index de7de9aaad8a..1097d14cd583 100644 --- a/fs/xfs/xfs_trans_resv.h +++ b/fs/xfs/xfs_trans_resv.h @@ -38,11 +38,11 @@ struct xfs_trans_resv { struct xfs_trans_res tr_remove; /* unlink trans */ struct xfs_trans_res tr_symlink; /* symlink trans */ struct xfs_trans_res tr_create; /* create trans */ + struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */ struct xfs_trans_res tr_mkdir; /* mkdir trans */ struct xfs_trans_res tr_ifree; /* inode free trans */ struct xfs_trans_res tr_ichange; /* inode update trans */ struct xfs_trans_res tr_growdata; /* fs data section grow trans */ - struct xfs_trans_res tr_swrite; /* sync write inode trans */ struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ struct xfs_trans_res tr_attrinval; /* attr fork buffer @@ -100,6 +100,7 @@ struct xfs_trans_resv { #define XFS_ITRUNCATE_LOG_COUNT 2 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 +#define XFS_CREATE_TMPFILE_LOG_COUNT 2 #define XFS_MKDIR_LOG_COUNT 3 #define XFS_SYMLINK_LOG_COUNT 3 #define XFS_REMOVE_LOG_COUNT 2 |