Diffstat (limited to 'fs')
349 files changed, 14185 insertions, 9241 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 2501e6f1f965..7b623e9fc1b0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -322,4 +322,7 @@ source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
 source "fs/unicode/Kconfig"
+config IO_WQ
+	bool
+
 endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 14231b4cf383..1148c555c4d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_EVENTFD)	+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)	+= aio.o
 obj-$(CONFIG_IO_URING)	+= io_uring.o
+obj-$(CONFIG_IO_WQ)	+= io-wq.o
 obj-$(CONFIG_FS_DAX)	+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FS_VERITY)	+= verity/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a92eb6ae2ae2..a755bef7c4c7 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -43,8 +43,8 @@ struct affs_ext_key {
  */
 struct affs_inode_info {
 	atomic_t i_opencnt;
-	struct semaphore i_link_lock;	/* Protects internal inode access. */
-	struct semaphore i_ext_lock;	/* Protects internal inode access. */
+	struct mutex i_link_lock;	/* Protects internal inode access. */
+	struct mutex i_ext_lock;	/* Protects internal inode access. */
 #define i_hash_lock i_ext_lock
 	u32	 i_blkcnt;	/* block count */
 	u32	 i_extcnt;	/* extended block count */
@@ -293,30 +293,30 @@ affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val)
 static inline void
 affs_lock_link(struct inode *inode)
 {
-	down(&AFFS_I(inode)->i_link_lock);
+	mutex_lock(&AFFS_I(inode)->i_link_lock);
 }
 static inline void
 affs_unlock_link(struct inode *inode)
 {
-	up(&AFFS_I(inode)->i_link_lock);
+	mutex_unlock(&AFFS_I(inode)->i_link_lock);
 }
 static inline void
 affs_lock_dir(struct inode *inode)
 {
-	down(&AFFS_I(inode)->i_hash_lock);
+	mutex_lock_nested(&AFFS_I(inode)->i_hash_lock, SINGLE_DEPTH_NESTING);
 }
 static inline void
 affs_unlock_dir(struct inode *inode)
 {
-	up(&AFFS_I(inode)->i_hash_lock);
+	mutex_unlock(&AFFS_I(inode)->i_hash_lock);
 }
 static inline void
 affs_lock_ext(struct inode *inode)
 {
-	down(&AFFS_I(inode)->i_ext_lock);
+	mutex_lock(&AFFS_I(inode)->i_ext_lock);
 }
 static inline void
 affs_unlock_ext(struct inode *inode)
 {
-	up(&AFFS_I(inode)->i_ext_lock);
+	mutex_unlock(&AFFS_I(inode)->i_ext_lock);
 }
diff --git a/fs/affs/super.c b/fs/affs/super.c
index cc463ae47c12..47107c6712a6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -121,8 +121,8 @@ static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
-	sema_init(&ei->i_link_lock, 1);
-	sema_init(&ei->i_ext_lock, 1);
+	mutex_init(&ei->i_link_lock);
+	mutex_init(&ei->i_ext_lock);
 	inode_init_once(&ei->vfs_inode);
 }
 
@@ -561,14 +561,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	int			 root_block;
 	unsigned long		 mount_flags;
 	int			 res = 0;
-	char			*new_opts;
 	char			 volume[32];
 	char			*prefix = NULL;
 
-	new_opts = kstrdup(data, GFP_KERNEL);
-	if (data && !new_opts)
-		return -ENOMEM;
-
 	pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
 
 	sync_filesystem(sb);
@@ -579,7 +574,6 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 			   &blocksize, &prefix, volume,
 			   &mount_flags)) {
 		kfree(prefix);
-		kfree(new_opts);
 		return -EINVAL;
 	}
 
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 6cdd7047c809..2dca8df1a18d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -312,7 +312,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
 	_enter("%p,%zu,", server, count);
 
 	ASSERT(server != NULL);
-	ASSERTCMP(count, <=, AFSCBMAX);
 
 	/* TODO: Sort the callback break list by volume ID */
 
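The affs hunks above replace binary semaphores used as sleeping locks with mutexes, which gains lockdep validation; affs_lock_dir() takes the lock with mutex_lock_nested() and SINGLE_DEPTH_NESTING because i_hash_lock aliases i_ext_lock, so two locks of the same class can legitimately nest. A minimal sketch of the same conversion pattern, using a hypothetical struct rather than affs's real inode:

#include <linux/mutex.h>

/* Sketch: binary semaphore -> mutex, as in the affs change above.
 * "struct demo_inode" is hypothetical. */
struct demo_inode {
	struct mutex lock;	/* was: struct semaphore initialized to 1 */
};

static void demo_init(struct demo_inode *di)
{
	mutex_init(&di->lock);		/* replaces sema_init(&..., 1) */
}

static void demo_lock(struct demo_inode *di)
{
	mutex_lock(&di->lock);		/* replaces down() */
}

static void demo_lock_second_instance(struct demo_inode *di)
{
	/* When two locks of the same lock class may nest, annotate the
	 * inner acquisition so lockdep does not report a false deadlock. */
	mutex_lock_nested(&di->lock, SINGLE_DEPTH_NESTING);
}

static void demo_unlock(struct demo_inode *di)
{
	mutex_unlock(&di->lock);	/* replaces up() */
}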
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index b86195e4dc6c..ff3994a6be23 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -342,14 +342,14 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 		if (call->count2 != call->count && call->count2 != 0)
 			return afs_protocol_error(call, -EBADMSG,
 						  afs_eproto_cb_count);
-		call->_iter = &call->iter;
-		iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4);
+		call->iter = &call->def_iter;
+		iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
 		call->unmarshall++;
 
 		/* Fall through */
 	case 4:
 		_debug("extract discard %zu/%u",
-		       iov_iter_count(&call->iter), call->count2 * 3 * 4);
+		       iov_iter_count(call->iter), call->count2 * 3 * 4);
 
 		ret = afs_extract_data(call, false);
 		if (ret < 0)
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cc12772d0a4d..497f979018c2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -803,7 +803,12 @@ success:
 			continue;
 
 		if (cookie->inodes[i]) {
-			afs_vnode_commit_status(&fc, AFS_FS_I(cookie->inodes[i]),
+			struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]);
+
+			if (test_bit(AFS_VNODE_UNSET, &iv->flags))
+				continue;
+
+			afs_vnode_commit_status(&fc, iv,
 						scb->cb_break, NULL, scb);
 			continue;
 		}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d4fbe5f85f1b..b108528bf010 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -68,13 +68,11 @@ static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_
 static void afs_set_contig_bits(union afs_xdr_dir_block *block,
 				int bit, unsigned int nr_slots)
 {
-	u64 mask, before, after;
+	u64 mask;
 
 	mask = (1 << nr_slots) - 1;
 	mask <<= bit;
 
-	before = *(u64 *)block->hdr.bitmap;
-
 	block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8);
 	block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8);
 	block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8);
@@ -83,8 +81,6 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
 	block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8);
 	block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8);
 	block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8);
-
-	after = *(u64 *)block->hdr.bitmap;
 }
 
 /*
@@ -93,13 +89,11 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
 static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
 				  int bit, unsigned int nr_slots)
 {
-	u64 mask, before, after;
+	u64 mask;
 
 	mask = (1 << nr_slots) - 1;
 	mask <<= bit;
 
-	before = *(u64 *)block->hdr.bitmap;
-
 	block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8);
 	block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8);
 	block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8);
@@ -108,8 +102,6 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
 	block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8);
 	block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8);
 	block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8);
-
-	after = *(u64 *)block->hdr.bitmap;
 }
 
 /*
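The dir_edit.c hunks drop write-only "before"/"after" locals; what remains is the underlying pattern of setting or clearing a run of contiguous bits in a byte-addressed on-disk bitmap by building one u64 mask and storing it byte by byte (the bitmap is not guaranteed to be aligned for a single 64-bit store). A self-contained sketch of the same idea, with the 1 widened to u64 before shifting so the mask is well defined for runs of 32 bits or more:

#include <stdint.h>

/* Sketch of the afs_set_contig_bits() pattern: set "nr" contiguous bits
 * starting at "bit" in an 8-byte, byte-addressed bitmap. */
static void set_contig_bits(uint8_t bitmap[8], int bit, unsigned int nr)
{
	uint64_t mask = (((uint64_t)1 << nr) - 1) << bit;	/* nr ones at "bit" */
	int i;

	for (i = 0; i < 8; i++)
		bitmap[i] |= (uint8_t)(mask >> (i * 8));	/* unaligned-safe */
}

static void clear_contig_bits(uint8_t bitmap[8], int bit, unsigned int nr)
{
	uint64_t mask = (((uint64_t)1 << nr) - 1) << bit;
	int i;

	for (i = 0; i < 8; i++)
		bitmap[i] &= (uint8_t)~(mask >> (i * 8));
}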
diff --git a/fs/afs/file.c b/fs/afs/file.c
index dd3c55c9101c..8415733f7bc1 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -223,7 +223,7 @@ static void afs_file_readpage_read_complete(struct page *page,
 /*
  * Fetch file data from the volume.
  */
-int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc)
+int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req)
 {
 	struct afs_fs_cursor fc;
 	struct afs_status_cb *scb;
@@ -246,7 +246,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
 		while (afs_select_fileserver(&fc)) {
 			fc.cb_break = afs_calc_vnode_cb_break(vnode);
-			afs_fs_fetch_data(&fc, scb, desc);
+			afs_fs_fetch_data(&fc, scb, req);
 		}
 
 		afs_check_for_remote_deletion(&fc, vnode);
@@ -257,7 +257,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
 	if (ret == 0) {
 		afs_stat_v(vnode, n_fetches);
-		atomic_long_add(desc->actual_len,
+		atomic_long_add(req->actual_len,
 				&afs_v2net(vnode)->n_fetch_bytes);
 	}
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index d5e5a6ddc847..0f2a94ba73cb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -346,8 +346,8 @@ again:
 		if (ret < 0) {
 			trace_afs_flock_ev(vnode, NULL, afs_flock_extend_fail, ret);
-			pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
-				   vnode->fid.vid, vnode->fid.vnode, ret);
+			pr_warn("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
+				vnode->fid.vid, vnode->fid.vnode, ret);
 		}
 
 		spin_lock(&vnode->lock);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 6f84231f11a5..1f9c5d8e6fe5 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -323,7 +323,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 	int ret;
 
 	_enter("{%u,%zu/%llu}",
-	       call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+	       call->unmarshall, iov_iter_count(call->iter), req->actual_len);
 
 	switch (call->unmarshall) {
 	case 0:
@@ -363,14 +363,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		call->bvec[0].bv_len = size;
 		call->bvec[0].bv_offset = req->offset;
 		call->bvec[0].bv_page = req->pages[req->index];
-		iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+		iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
 		ASSERTCMP(size, <=, PAGE_SIZE);
 
 		/* Fall through */
 
 		/* extract the returned data */
 	case 2:
 		_debug("extract data %zu/%llu",
-		       iov_iter_count(&call->iter), req->remain);
+		       iov_iter_count(call->iter), req->remain);
 
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -398,7 +398,7 @@
 	case 3:
 		_debug("extract discard %zu/%llu",
-		       iov_iter_count(&call->iter), req->actual_len - req->len);
+		       iov_iter_count(call->iter), req->actual_len - req->len);
 
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -490,7 +490,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
 	call->key = fc->key;
 	call->out_scb = scb;
 	call->out_volsync = NULL;
-	call->read_request = req;
+	call->read_request = afs_get_read(req);
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -503,7 +503,6 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
 	bp[6] = 0;
 	bp[7] = htonl(lower_32_bits(req->len));
 
-	refcount_inc(&req->usage);
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
 	afs_set_fc_call(call, fc);
@@ -540,7 +539,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc,
 	call->key = fc->key;
 	call->out_scb = scb;
 	call->out_volsync = NULL;
-	call->read_request = req;
+	call->read_request = afs_get_read(req);
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -551,7 +550,6 @@
 	bp[4] = htonl(lower_32_bits(req->pos));
 	bp[5] = htonl(lower_32_bits(req->len));
 
-	refcount_inc(&req->usage);
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
 	afs_set_fc_call(call, fc);
@@ -1852,7 +1850,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
 	u32 count;
 	int ret;
 
-	_enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter));
+	_enter("{%u,%zu}", call->unmarshall, iov_iter_count(call->iter));
 
 	switch (call->unmarshall) {
 	case 0:
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 46d2d7cb461d..281470fe1183 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -34,8 +34,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
 {
 	static unsigned long once_only;
 
-	pr_warn("kAFS: AFS vnode with undefined type %u\n",
-		vnode->status.type);
+	pr_warn("kAFS: AFS vnode with undefined type %u\n", vnode->status.type);
 	pr_warn("kAFS: A=%d m=%o s=%llx v=%llx\n",
 		vnode->status.abort_code,
 		vnode->status.mode,
@@ -175,11 +174,11 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
 	BUG_ON(test_bit(AFS_VNODE_UNSET, &vnode->flags));
 
 	if (status->type != vnode->status.type) {
-		pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
-			   vnode->fid.vid,
-			   vnode->fid.vnode,
-			   vnode->fid.unique,
-			   status->type, vnode->status.type);
+		pr_warn("Vnode %llx:%llx:%x changed type %u to %u\n",
+			vnode->fid.vid,
+			vnode->fid.vnode,
+			vnode->fid.unique,
+			status->type, vnode->status.type);
 		afs_protocol_error(NULL, -EBADMSG, afs_eproto_bad_status);
 		return;
 	}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 759e0578012c..1d81fc4c3058 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -115,9 +115,9 @@ struct afs_call {
 	struct afs_vnode	*lvnode;	/* vnode being locked */
 	void			*request;	/* request data (first part) */
 	struct address_space	*mapping;	/* Pages being written from */
-	struct iov_iter		iter;		/* Buffer iterator */
-	struct iov_iter		*_iter;		/* Iterator currently in use */
-	union {	/* Convenience for ->iter */
+	struct iov_iter		def_iter;	/* Default buffer/data iterator */
+	struct iov_iter		*iter;		/* Iterator currently in use */
+	union {	/* Convenience for ->def_iter */
 		struct kvec	kvec[1];
 		struct bio_vec	bvec[1];
 	};
@@ -934,6 +934,12 @@ extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *);
 extern int afs_page_filler(void *, struct page *);
 extern void afs_put_read(struct afs_read *);
 
+static inline struct afs_read *afs_get_read(struct afs_read *req)
+{
+	refcount_inc(&req->usage);
+	return req;
+}
+
 /*
  * flock.c
  */
@@ -1136,7 +1142,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
 {
 	call->kvec[0].iov_base = buf;
 	call->kvec[0].iov_len = size;
-	iov_iter_kvec(&call->iter, READ, call->kvec, 1, size);
+	iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
 }
 
 static inline void afs_extract_to_tmp(struct afs_call *call)
@@ -1151,7 +1157,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call)
 
 static inline void afs_extract_discard(struct afs_call *call, size_t size)
 {
-	iov_iter_discard(&call->iter, READ, size);
+	iov_iter_discard(&call->def_iter, READ, size);
 }
 
 static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
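The internal.h hunk adds afs_get_read(), a "get" helper that takes a reference and returns the object, so the fsclient.c and yfsclient.c call sites can grab the reference inline at the point of assignment instead of with a separate refcount_inc() further down. A minimal sketch of the same idiom, with a hypothetical struct in place of afs_read:

#include <linux/refcount.h>

/* Sketch of the afs_get_read() idiom added above.
 * "struct demo_req" is hypothetical. */
struct demo_req {
	refcount_t usage;
};

static inline struct demo_req *demo_get_req(struct demo_req *req)
{
	refcount_inc(&req->usage);
	return req;		/* returning the object enables inline grabs */
}

/* Usage, mirroring the fetch-data call sites:
 *	call->read_request = demo_get_req(req);
 * instead of assigning the pointer and doing refcount_inc() separately,
 * which keeps the reference acquisition visibly paired with the store. */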
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 0e5269374ac1..58d396592250 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -152,7 +152,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
 	INIT_WORK(&call->async_work, afs_process_async_call);
 	init_waitqueue_head(&call->waitq);
 	spin_lock_init(&call->state_lock);
-	call->_iter = &call->iter;
+	call->iter = &call->def_iter;
 
 	o = atomic_inc_return(&net->nr_outstanding_calls);
 	trace_afs_call(call, afs_call_trace_alloc, 1, o,
@@ -513,12 +513,12 @@ static void afs_deliver_to_call(struct afs_call *call)
 	       state == AFS_CALL_SV_AWAIT_ACK
 	       ) {
 		if (state == AFS_CALL_SV_AWAIT_ACK) {
-			iov_iter_kvec(&call->iter, READ, NULL, 0, 0);
+			iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
 			ret = rxrpc_kernel_recv_data(call->net->socket,
-						     call->rxcall, &call->iter,
+						     call->rxcall, &call->def_iter,
 						     false, &remote_abort,
 						     &call->service_id);
-			trace_afs_receive_data(call, &call->iter, false, ret);
+			trace_afs_receive_data(call, &call->def_iter, false, ret);
 
 			if (ret == -EINPROGRESS || ret == -EAGAIN)
 				return;
@@ -637,6 +637,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
 			call->need_attention = false;
 			__set_current_state(TASK_RUNNING);
 			afs_deliver_to_call(call);
+			timeout = rtt2;
 			continue;
 		}
 
@@ -858,7 +859,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
 {
 	int ret;
 
-	_enter("{%zu}", iov_iter_count(call->_iter));
+	_enter("{%zu}", iov_iter_count(call->iter));
 
 	/* the operation ID forms the first four bytes of the request data */
 	ret = afs_extract_data(call, true);
@@ -974,7 +975,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 int afs_extract_data(struct afs_call *call, bool want_more)
 {
 	struct afs_net *net = call->net;
-	struct iov_iter *iter = call->_iter;
+	struct iov_iter *iter = call->iter;
 	enum afs_call_state state;
 	u32 remote_abort = 0;
 	int ret;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 64d440aaabc0..1686bf188ccd 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -151,7 +151,7 @@ static struct afs_server *afs_install_server(struct afs_net *net,
 	const struct afs_addr_list *alist;
 	struct afs_server *server;
 	struct rb_node **pp, *p;
-	int ret = -EEXIST, diff;
+	int diff;
 
 	_enter("%p", candidate);
 
@@ -196,7 +196,6 @@ static struct afs_server *afs_install_server(struct afs_net *net,
 	hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
 
 	write_sequnlock(&net->fs_addr_lock);
-	ret = 0;
 
 exists:
 	afs_get_server(server, afs_server_trace_get_install);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f18911e8d770..488641b1a418 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -435,6 +435,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
 	/* fill in the superblock */
 	sb->s_blocksize		= PAGE_SIZE;
 	sb->s_blocksize_bits	= PAGE_SHIFT;
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
 	sb->s_magic		= AFS_FS_MAGIC;
 	sb->s_op		= &afs_super_ops;
 	if (!as->dyn_root)
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 21eb0c0be912..8fea54eba0c2 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -279,8 +279,8 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
 			struct afs_addr_list *old = addrs;
 
 			write_lock(&server->lock);
-			rcu_swap_protected(server->addresses, old,
-					   lockdep_is_held(&server->lock));
+			old = rcu_replace_pointer(server->addresses, old,
+						  lockdep_is_held(&server->lock));
 			write_unlock(&server->lock);
 			afs_put_addrlist(old);
 		}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index cfb0ac4bd039..516e9a3bb5b4 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -185,7 +185,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
 	int i, ret;
 
 	_enter("{%u,%zu/%u}",
-	       call->unmarshall, iov_iter_count(call->_iter), call->count);
+	       call->unmarshall, iov_iter_count(call->iter), call->count);
 
 	switch (call->unmarshall) {
 	case 0:
@@ -316,7 +316,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
 	int ret;
 
 	_enter("{%u,%zu/%u}",
-	       call->unmarshall, iov_iter_count(call->_iter), call->count);
+	       call->unmarshall, iov_iter_count(call->iter), call->count);
 
 	switch (call->unmarshall) {
 	case 0:
@@ -425,7 +425,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
 	int ret;
 
 	_enter("{%u,%zu,%u}",
-	       call->unmarshall, iov_iter_count(call->_iter), call->count2);
+	       call->unmarshall, iov_iter_count(call->iter), call->count2);
 
 	switch (call->unmarshall) {
 	case 0:
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 5552d034090a..7af41fd5f3ee 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -228,11 +228,11 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler,
 			break;
 		case 1:
 			data = buf;
-			dsize = snprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
+			dsize = scnprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
 			break;
 		case 2:
 			data = buf;
-			dsize = snprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
+			dsize = scnprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
 			break;
 		case 3:
 			data = yacl->vol_acl->data;
@@ -370,13 +370,15 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler,
 	/* The volume ID is 64-bit, the vnode ID is 96-bit and the
 	 * uniquifier is 32-bit.
 	 */
-	len = sprintf(text, "%llx:", vnode->fid.vid);
+	len = scnprintf(text, sizeof(text), "%llx:", vnode->fid.vid);
 	if (vnode->fid.vnode_hi)
-		len += sprintf(text + len, "%x%016llx",
-			       vnode->fid.vnode_hi, vnode->fid.vnode);
+		len += scnprintf(text + len, sizeof(text) - len, "%x%016llx",
+				 vnode->fid.vnode_hi, vnode->fid.vnode);
 	else
-		len += sprintf(text + len, "%llx", vnode->fid.vnode);
-	len += sprintf(text + len, ":%x", vnode->fid.unique);
+		len += scnprintf(text + len, sizeof(text) - len, "%llx",
+				 vnode->fid.vnode);
+	len += scnprintf(text + len, sizeof(text) - len, ":%x",
+			 vnode->fid.unique);
 
 	if (size == 0)
 		return len;
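The xattr.c hunks convert sprintf/snprintf chains to scnprintf. The distinction matters when return values are summed to append at text + len: snprintf returns the length that *would* have been written, so on truncation the running offset can walk past the end of the buffer, while scnprintf returns the number of bytes actually stored. A small sketch of the resulting safe chaining, with hypothetical parameters standing in for the vnode fields:

#include <linux/kernel.h>

/* Sketch of the sprintf -> scnprintf conversion above: chained
 * formatting into a fixed buffer that cannot run past its end. */
static size_t demo_format_fid(char *text, size_t size,
			      unsigned long long vid,
			      unsigned long long vnode, unsigned int uniq)
{
	size_t len;

	len = scnprintf(text, size, "%llx:", vid);
	len += scnprintf(text + len, size - len, "%llx", vnode);
	len += scnprintf(text + len, size - len, ":%x", uniq);
	return len;	/* bytes actually stored, excluding the NUL */
}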
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 3ee7abf4b2d0..a26126ac7bf1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -152,8 +152,8 @@ static void yfs_check_req(struct afs_call *call, __be32 *bp)
 		pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n",
 		       call->type->name, len, call->request_size);
 	else if (len < call->request_size)
-		pr_warning("kAFS: %s: Request buffer underflow (%zu<%u)\n",
-			   call->type->name, len, call->request_size);
+		pr_warn("kAFS: %s: Request buffer underflow (%zu<%u)\n",
+			call->type->name, len, call->request_size);
 }
 
 /*
@@ -441,7 +441,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 	int ret;
 
 	_enter("{%u,%zu/%llu}",
-	       call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+	       call->unmarshall, iov_iter_count(call->iter), req->actual_len);
 
 	switch (call->unmarshall) {
 	case 0:
@@ -476,14 +476,14 @@
 		call->bvec[0].bv_len = size;
 		call->bvec[0].bv_offset = req->offset;
 		call->bvec[0].bv_page = req->pages[req->index];
-		iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+		iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
 		ASSERTCMP(size, <=, PAGE_SIZE);
 
 		/* Fall through */
 
 		/* extract the returned data */
 	case 2:
 		_debug("extract data %zu/%llu",
-		       iov_iter_count(&call->iter), req->remain);
+		       iov_iter_count(call->iter), req->remain);
 
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -511,7 +511,7 @@
 	case 3:
 		_debug("extract discard %zu/%llu",
-		       iov_iter_count(&call->iter), req->actual_len - req->len);
+		       iov_iter_count(call->iter), req->actual_len - req->len);
 
 		ret = afs_extract_data(call, true);
 		if (ret < 0)
@@ -605,7 +605,7 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
 	call->key = fc->key;
 	call->out_scb = scb;
 	call->out_volsync = NULL;
-	call->read_request = req;
+	call->read_request = afs_get_read(req);
 
 	/* marshall the parameters */
 	bp = call->request;
@@ -616,7 +616,6 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
 	bp = xdr_encode_u64(bp, req->len);
 	yfs_check_req(call, bp);
 
-	refcount_inc(&req->usage);
 	afs_use_fs_server(call, fc->cbi);
 	trace_afs_make_fs_call(call, &vnode->fid);
 	afs_set_fc_call(call, fc);
diff --git a/fs/aio.c b/fs/aio.c
@@ -2056,7 +2056,7 @@ static long do_io_getevents(aio_context_t ctx_id,
  *	specifies an infinite timeout. Note that the timeout pointed to by
  *	timeout is relative.  Will fail with -ENOSYS if not implemented.
  */
-#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+#ifdef CONFIG_64BIT
 SYSCALL_DEFINE5(io_getevents,
 		aio_context_t, ctx_id,
 		long, min_nr,
@@ -2179,7 +2179,7 @@ SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
 #ifdef CONFIG_COMPAT
 
 struct __compat_aio_sigset {
-	compat_sigset_t __user	*sigmask;
+	compat_uptr_t		sigmask;
 	compat_size_t		sigsetsize;
 };
 
@@ -2193,7 +2193,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 		struct old_timespec32 __user *, timeout,
 		const struct __compat_aio_sigset __user *, usig)
 {
-	struct __compat_aio_sigset ksig = { NULL, };
+	struct __compat_aio_sigset ksig = { 0, };
 	struct timespec64 t;
 	bool interrupted;
 	int ret;
@@ -2204,7 +2204,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
 	if (ret)
 		return ret;
 
@@ -2228,7 +2228,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 		struct __kernel_timespec __user *, timeout,
 		const struct __compat_aio_sigset __user *, usig)
 {
-	struct __compat_aio_sigset ksig = { NULL, };
+	struct __compat_aio_sigset ksig = { 0, };
 	struct timespec64 t;
 	bool interrupted;
 	int ret;
@@ -2239,7 +2239,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
 	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
 		return -EFAULT;
 
-	ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+	ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
 	if (ret)
 		return ret;
 
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 8bcec8dcabb6..054f97b07754 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -63,7 +63,7 @@ struct autofs_info {
 	struct autofs_sb_info *sbi;
 	unsigned long last_used;
-	atomic_t count;
+	int count;
 
 	kuid_t uid;
 	kgid_t gid;
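The __compat_aio_sigset change above illustrates a compat-ABI rule: a 32-bit userspace struct holds a 32-bit pointer, so the kernel-side mirror must store it as compat_uptr_t and widen it with compat_ptr() at the point of use; declaring a native __user pointer makes the struct layout (and copy_from_user()) wrong on 64-bit kernels. A minimal sketch of the pattern, with a hypothetical struct name:

#include <linux/compat.h>
#include <linux/signal.h>

/* Sketch of the compat_uptr_t fix above.
 * "struct demo_compat_sigset" is hypothetical. */
struct demo_compat_sigset {
	compat_uptr_t	sigmask;	/* not: compat_sigset_t __user * */
	compat_size_t	sigsetsize;
};

static int demo_apply_sigmask(const struct demo_compat_sigset *ksig)
{
	/* compat_ptr() widens the 32-bit user address to a native
	 * __user pointer in a way that is correct on all compat ABIs. */
	compat_sigset_t __user *umask = compat_ptr(ksig->sigmask);

	return set_compat_user_sigmask(umask, ksig->sigsetsize);
}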
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 2866fabf497f..a1c7701007e7 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -211,7 +211,7 @@ static int autofs_tree_busy(struct vfsmount *mnt,
 			}
 		} else {
 			struct autofs_info *ino = autofs_dentry_ino(p);
-			unsigned int ino_count = atomic_read(&ino->count);
+			unsigned int ino_count = READ_ONCE(ino->count);
 
 			/* allow for dget above and top is already dgot */
 			if (p == top)
@@ -379,7 +379,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 		/* Not a forced expire? */
 		if (!(how & AUTOFS_EXP_FORCED)) {
 			/* ref-walk currently on this dentry? */
-			ino_count = atomic_read(&ino->count) + 1;
+			ino_count = READ_ONCE(ino->count) + 1;
 			if (d_count(dentry) > ino_count)
 				return NULL;
 		}
@@ -396,7 +396,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 		/* Not a forced expire? */
 		if (!(how & AUTOFS_EXP_FORCED)) {
 			/* ref-walk currently on this dentry? */
-			ino_count = atomic_read(&ino->count) + 1;
+			ino_count = READ_ONCE(ino->count) + 1;
 			if (d_count(dentry) > ino_count)
 				return NULL;
 		}
@@ -459,9 +459,10 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 			 */
 			how &= ~AUTOFS_EXP_LEAVES;
 			found = should_expire(expired, mnt, timeout, how);
-			if (!found || found != expired)
-				/* Something has changed, continue */
+			if (found != expired) { // something has changed, continue
+				dput(found);
 				goto next;
+			}
 
 			if (expired != dentry)
 				dput(dentry);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 29abafc0ce31..5aaa1732bf1e 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -569,10 +569,9 @@ static int autofs_dir_symlink(struct inode *dir,
 	d_add(dentry, inode);
 
 	dget(dentry);
-	atomic_inc(&ino->count);
+	ino->count++;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
-	if (p_ino && !IS_ROOT(dentry))
-		atomic_inc(&p_ino->count);
+	p_ino->count++;
 
 	dir->i_mtime = current_time(dir);
 
@@ -610,11 +609,9 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
 	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -EACCES;
 
-	if (atomic_dec_and_test(&ino->count)) {
-		p_ino = autofs_dentry_ino(dentry->d_parent);
-		if (p_ino && !IS_ROOT(dentry))
-			atomic_dec(&p_ino->count);
-	}
+	ino->count--;
+	p_ino = autofs_dentry_ino(dentry->d_parent);
+	p_ino->count--;
 	dput(ino->dentry);
 
 	d_inode(dentry)->i_size = 0;
@@ -660,7 +657,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
 
 static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
 {
-	struct list_head *d_child;
 	struct dentry *parent;
 
 	/* flags for dentrys in the root are handled elsewhere */
@@ -673,10 +669,7 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
 	/* only consider parents below dentrys in the root */
 	if (IS_ROOT(parent->d_parent))
 		return;
-	d_child = &dentry->d_child;
-	/* Set parent managed if it's becoming empty */
-	if (d_child->next == &parent->d_subdirs &&
-	    d_child->prev == &parent->d_subdirs)
+	if (autofs_dentry_ino(parent)->count == 2)
 		managed_dentry_set_managed(parent);
 }
 
@@ -698,11 +691,10 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sbi->flags & AUTOFS_SBI_CATATONIC)
 		return -EACCES;
 
-	spin_lock(&sbi->lookup_lock);
-	if (!simple_empty(dentry)) {
-		spin_unlock(&sbi->lookup_lock);
+	if (ino->count != 1)
 		return -ENOTEMPTY;
-	}
+
+	spin_lock(&sbi->lookup_lock);
 	__autofs_add_expiring(dentry);
 	d_drop(dentry);
 	spin_unlock(&sbi->lookup_lock);
@@ -710,11 +702,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (sbi->version < 5)
 		autofs_clear_leaf_automount_flags(dentry);
 
-	if (atomic_dec_and_test(&ino->count)) {
-		p_ino = autofs_dentry_ino(dentry->d_parent);
-		if (p_ino && dentry->d_parent != dentry)
-			atomic_dec(&p_ino->count);
-	}
+	ino->count--;
+	p_ino = autofs_dentry_ino(dentry->d_parent);
+	p_ino->count--;
 	dput(ino->dentry);
 	d_inode(dentry)->i_size = 0;
 	clear_nlink(d_inode(dentry));
@@ -760,10 +750,9 @@ static int autofs_dir_mkdir(struct inode *dir,
 	autofs_set_leaf_automount_flags(dentry);
 
 	dget(dentry);
-	atomic_inc(&ino->count);
+	ino->count++;
 	p_ino = autofs_dentry_ino(dentry->d_parent);
-	if (p_ino && !IS_ROOT(dentry))
-		atomic_inc(&p_ino->count);
+	p_ino->count++;
 	inc_nlink(dir);
 	dir->i_mtime = current_time(dir);
 
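The autofs conversion above turns ino->count from an atomic_t into a plain int: the writers in root.c are already serialized (by the VFS inode locks held across mkdir/rmdir/symlink/unlink), so atomics bought nothing, while the lockless readers in the expire path now use READ_ONCE() to get a single, tear-free snapshot the compiler cannot reload. A minimal sketch of that reader/writer split, with a hypothetical struct:

#include <linux/compiler.h>

/* Sketch of the atomic_t -> int + READ_ONCE() conversion above.
 * "struct demo_info" is hypothetical; writers of "count" are assumed
 * to be serialized by an external lock, as in autofs. */
struct demo_info {
	int count;		/* writers hold a lock; readers may not */
};

/* Locked writer: a plain increment is sufficient. */
static void demo_ref(struct demo_info *ino)
{
	ino->count++;
}

/* Lockless reader: one annotated load, then work with the snapshot. */
static bool demo_busy(const struct demo_info *ino, int d_count)
{
	unsigned int ino_count = READ_ONCE(ino->count) + 1;

	return d_count > ino_count;
}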
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c5642bcb6b46..ecd8d2698515 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -404,6 +404,17 @@ static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
 				ELF_PAGESTART(cmds[first_idx].p_vaddr);
 }
 
+static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
+{
+	ssize_t rv;
+
+	rv = kernel_read(file, buf, len, &pos);
+	if (unlikely(rv != len)) {
+		return (rv < 0) ? rv : -EIO;
+	}
+	return 0;
+}
+
 /**
  * load_elf_phdrs() - load ELF program headers
  * @elf_ex:   ELF header of the binary whose program headers should be loaded
@@ -418,7 +429,6 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
 {
 	struct elf_phdr *elf_phdata = NULL;
 	int retval, err = -1;
-	loff_t pos = elf_ex->e_phoff;
 	unsigned int size;
 
 	/*
@@ -439,9 +449,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
 		goto out;
 
 	/* Read in the program headers */
-	retval = kernel_read(elf_file, elf_phdata, size, &pos);
-	if (retval != size) {
-		err = (retval < 0) ? retval : -EIO;
+	retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff);
+	if (retval < 0) {
+		err = retval;
 		goto out;
 	}
 
@@ -544,7 +554,7 @@ static inline int make_prot(u32 p_flags)
    an ELF header */
 
 static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
-		struct file *interpreter, unsigned long *interp_map_addr,
+		struct file *interpreter, unsigned long no_base,
 		struct elf_phdr *interp_elf_phdata)
 {
 	struct elf_phdr *eppnt;
@@ -590,8 +600,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			map_addr = elf_map(interpreter, load_addr + vaddr,
 					eppnt, elf_prot, elf_type, total_size);
 			total_size = 0;
-			if (!*interp_map_addr)
-				*interp_map_addr = map_addr;
 			error = map_addr;
 			if (BAD_ADDR(map_addr))
 				goto out;
@@ -722,7 +730,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	elf_ppnt = elf_phdata;
 	for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
 		char *elf_interpreter;
-		loff_t pos;
 
 		if (elf_ppnt->p_type != PT_INTERP)
 			continue;
@@ -740,14 +747,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		if (!elf_interpreter)
 			goto out_free_ph;
 
-		pos = elf_ppnt->p_offset;
-		retval = kernel_read(bprm->file, elf_interpreter,
-				     elf_ppnt->p_filesz, &pos);
-		if (retval != elf_ppnt->p_filesz) {
-			if (retval >= 0)
-				retval = -EIO;
+		retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
+				  elf_ppnt->p_offset);
+		if (retval < 0)
 			goto out_free_interp;
-		}
 		/* make sure path is NULL terminated */
 		retval = -ENOEXEC;
 		if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
@@ -766,14 +769,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		would_dump(bprm, interpreter);
 
 		/* Get the exec headers */
-		pos = 0;
-		retval = kernel_read(interpreter, &loc->interp_elf_ex,
-				     sizeof(loc->interp_elf_ex), &pos);
-		if (retval != sizeof(loc->interp_elf_ex)) {
-			if (retval >= 0)
-				retval = -EIO;
+		retval = elf_read(interpreter, &loc->interp_elf_ex,
+				  sizeof(loc->interp_elf_ex), 0);
+		if (retval < 0)
 			goto out_free_dentry;
-		}
 
 		break;
 
@@ -1054,11 +1053,8 @@ out_free_interp:
 	}
 
 	if (interpreter) {
-		unsigned long interp_map_addr = 0;
-
 		elf_entry = load_elf_interp(&loc->interp_elf_ex,
 					    interpreter,
-					    &interp_map_addr,
 					    load_bias, interp_elf_phdata);
 		if (!IS_ERR((void *)elf_entry)) {
 			/*
@@ -1179,11 +1175,10 @@ static int load_elf_library(struct file *file)
 	unsigned long elf_bss, bss, len;
 	int retval, error, i, j;
 	struct elfhdr elf_ex;
-	loff_t pos = 0;
 
 	error = -ENOEXEC;
-	retval = kernel_read(file, &elf_ex, sizeof(elf_ex), &pos);
-	if (retval != sizeof(elf_ex))
+	retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0);
+	if (retval < 0)
 		goto out;
 
 	if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
@@ -1208,9 +1203,8 @@ static int load_elf_library(struct file *file)
 	eppnt = elf_phdata;
 	error = -ENOEXEC;
-	pos = elf_ex.e_phoff;
-	retval = kernel_read(file, eppnt, j, &pos);
-	if (retval != j)
+	retval = elf_read(file, eppnt, j, elf_ex.e_phoff);
+	if (retval < 0)
 		goto out_free_ph;
 
 	for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
@@ -1489,18 +1483,18 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		 * group-wide total, not its individual thread total.
 		 */
 		thread_group_cputime(p, &cputime);
-		prstatus->pr_utime = ns_to_timeval(cputime.utime);
-		prstatus->pr_stime = ns_to_timeval(cputime.stime);
+		prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime);
+		prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime);
 	} else {
 		u64 utime, stime;
 
 		task_cputime(p, &utime, &stime);
-		prstatus->pr_utime = ns_to_timeval(utime);
-		prstatus->pr_stime = ns_to_timeval(stime);
+		prstatus->pr_utime = ns_to_kernel_old_timeval(utime);
+		prstatus->pr_stime = ns_to_kernel_old_timeval(stime);
 	}
 
-	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
-	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
+	prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
+	prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
 }
 
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d86ebd0dcc3d..240f66663543 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1359,17 +1359,17 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		 * group-wide total, not its individual thread total.
 		 */
 		thread_group_cputime(p, &cputime);
-		prstatus->pr_utime = ns_to_timeval(cputime.utime);
-		prstatus->pr_stime = ns_to_timeval(cputime.stime);
+		prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime);
+		prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime);
 	} else {
 		u64 utime, stime;
 
 		task_cputime(p, &utime, &stime);
-		prstatus->pr_utime = ns_to_timeval(utime);
-		prstatus->pr_stime = ns_to_timeval(stime);
+		prstatus->pr_utime = ns_to_kernel_old_timeval(utime);
+		prstatus->pr_stime = ns_to_kernel_old_timeval(stime);
 	}
-	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
-	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
+	prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
+	prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
 
 	prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
 	prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
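The new elf_read() helper above collapses a repeated pattern: every ELF loader call site needed a loff_t local plus short-read handling, and a short read had to be mapped to -EIO by hand. Wrapping kernel_read() gives each caller "all or -errno" semantics with the position passed by value. A sketch of the same wrapper, matching the helper introduced in the diff:

#include <linux/fs.h>

/* Sketch of the elf_read() helper above: turn kernel_read()'s
 * short-read return convention into "0 on full read, else -errno". */
static int demo_read_exact(struct file *file, void *buf, size_t len,
			   loff_t pos)
{
	ssize_t rv = kernel_read(file, buf, len, &pos);

	if (rv != len)
		return rv < 0 ? rv : -EIO;	/* a short read is an error */
	return 0;
}

/* Usage, mirroring load_elf_phdrs():
 *	retval = demo_read_exact(file, elf_phdata, size, e_phoff);
 *	if (retval < 0)
 *		goto out;
 * The caller no longer needs a mutable loff_t or a size comparison. */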
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9c073dbdc1b0..69bf2fb6f7cd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1403,11 +1403,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 		       "resized disk %s\n",
 		       bdev->bd_disk ? bdev->bd_disk->disk_name : "");
 	}
-
-	if (!bdev->bd_disk)
-		return;
-	if (disk_part_scan_enabled(bdev->bd_disk))
-		bdev->bd_invalidated = 1;
+	bdev->bd_invalidated = 1;
 }
 
 /**
@@ -1420,8 +1416,8 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 * and adjusts it if it differs. When shrinking the bdev size, its all caches
 * are freed.
 */
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
-		bool verbose)
+static void check_disk_size_change(struct gendisk *disk,
+		struct block_device *bdev, bool verbose)
 {
 	loff_t disk_size, bdev_size;
 
@@ -1437,6 +1433,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
 		if (bdev_size > disk_size)
 			flush_disk(bdev, false);
 	}
+	bdev->bd_invalidated = 0;
 }
 
 /**
@@ -1466,7 +1463,6 @@ int revalidate_disk(struct gendisk *disk)
 
 		mutex_lock(&bdev->bd_mutex);
 		check_disk_size_change(disk, bdev, ret == 0);
-		bdev->bd_invalidated = 0;
 		mutex_unlock(&bdev->bd_mutex);
 		bdput(bdev);
 	}
@@ -1512,6 +1508,45 @@ EXPORT_SYMBOL(bd_set_size);
 
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
+int bdev_disk_changed(struct block_device *bdev, bool invalidate)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	int ret;
+
+	lockdep_assert_held(&bdev->bd_mutex);
+
+rescan:
+	ret = blk_drop_partitions(disk, bdev);
+	if (ret)
+		return ret;
+
+	if (invalidate)
+		set_capacity(disk, 0);
+	else if (disk->fops->revalidate_disk)
+		disk->fops->revalidate_disk(disk);
+
+	check_disk_size_change(disk, bdev, !invalidate);
+
+	if (get_capacity(disk)) {
+		ret = blk_add_partitions(disk, bdev);
+		if (ret == -EAGAIN)
+			goto rescan;
+	} else if (invalidate) {
+		/*
+		 * Tell userspace that the media / partition table may have
+		 * changed.
+		 */
+		kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+	}
+
+	return ret;
+}
+/*
+ * Only exported for for loop and dasd for historic reasons. Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);
+
 /*
  * bd_mutex locking:
  *
@@ -1594,12 +1629,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				 * The latter is necessary to prevent ghost
 				 * partitions on a removed medium.
 				 */
-				if (bdev->bd_invalidated) {
-					if (!ret)
-						rescan_partitions(disk, bdev);
-					else if (ret == -ENOMEDIUM)
-						invalidate_partitions(disk, bdev);
-				}
+				if (bdev->bd_invalidated &&
+				    (!ret || ret == -ENOMEDIUM))
+					bdev_disk_changed(bdev, ret == -ENOMEDIUM);
 
 				if (ret)
 					goto out_clear;
@@ -1632,12 +1664,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (bdev->bd_disk->fops->open)
 				ret = bdev->bd_disk->fops->open(bdev, mode);
 			/* the same as first opener case, read comment there */
-			if (bdev->bd_invalidated) {
-				if (!ret)
-					rescan_partitions(bdev->bd_disk, bdev);
-				else if (ret == -ENOMEDIUM)
-					invalidate_partitions(bdev->bd_disk, bdev);
-			}
+			if (bdev->bd_invalidated &&
+			    (!ret || ret == -ENOMEDIUM))
+				bdev_disk_changed(bdev, ret == -ENOMEDIUM);
 			if (ret)
 				goto out_unlock_bdev;
 		}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 38651fae7f21..75b6d10c9845 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,8 @@ config BTRFS_FS
 	select CRYPTO
 	select CRYPTO_CRC32C
 	select LIBCRC32C
+	select CRYPTO_XXHASH
+	select CRYPTO_SHA256
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
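With the block_dev.c change, both __blkdev_get() paths funnel the old rescan_partitions()/invalidate_partitions() pair through a single bdev_disk_changed(), which internally drops partitions, revalidates or zeroes the capacity, and retries on -EAGAIN. A sketch of the resulting caller-side pattern, with hypothetical locals and the bd_mutex held as the lockdep assertion in the new function requires:

/* Sketch of the bdev_disk_changed() call pattern introduced above.
 * "bdev" and "ret" are hypothetical caller state; bd_mutex is held. */
static void demo_handle_media_change(struct block_device *bdev, int ret)
{
	/* Rescan the partition table on success; on -ENOMEDIUM, pass
	 * invalidate=true so capacity is zeroed, stale partitions are
	 * dropped, and a KOBJ_CHANGE uevent notifies userspace.  The
	 * -EAGAIN retry from blk_add_partitions() is handled inside. */
	if (bdev->bd_invalidated && (!ret || ret == -ENOMEDIUM))
		bdev_disk_changed(bdev, ret == -ENOMEDIUM);
}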
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 2e9e13ffbd08..1d32a07bb2d1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -53,24 +53,12 @@ struct btrfs_workqueue {
 	struct __btrfs_workqueue *high;
 };
 
-static void normal_work_helper(struct btrfs_work *work);
-
-#define BTRFS_WORK_HELPER(name)					\
-noinline_for_stack void btrfs_##name(struct work_struct *arg)		\
-{									\
-	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
-					       normal_work);		\
-	normal_work_helper(work);					\
-}
-
-struct btrfs_fs_info *
-btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
 {
 	return wq->fs_info;
 }
 
-struct btrfs_fs_info *
-btrfs_work_owner(const struct btrfs_work *work)
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
 {
 	return work->wq->fs_info;
 }
@@ -89,29 +77,6 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
 	return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
 }
 
-BTRFS_WORK_HELPER(worker_helper);
-BTRFS_WORK_HELPER(delalloc_helper);
-BTRFS_WORK_HELPER(flush_delalloc_helper);
-BTRFS_WORK_HELPER(cache_helper);
-BTRFS_WORK_HELPER(submit_helper);
-BTRFS_WORK_HELPER(fixup_helper);
-BTRFS_WORK_HELPER(endio_helper);
-BTRFS_WORK_HELPER(endio_meta_helper);
-BTRFS_WORK_HELPER(endio_meta_write_helper);
-BTRFS_WORK_HELPER(endio_raid56_helper);
-BTRFS_WORK_HELPER(endio_repair_helper);
-BTRFS_WORK_HELPER(rmw_helper);
-BTRFS_WORK_HELPER(endio_write_helper);
-BTRFS_WORK_HELPER(freespace_write_helper);
-BTRFS_WORK_HELPER(delayed_meta_helper);
-BTRFS_WORK_HELPER(readahead_helper);
-BTRFS_WORK_HELPER(qgroup_rescan_helper);
-BTRFS_WORK_HELPER(extent_refs_helper);
-BTRFS_WORK_HELPER(scrub_helper);
-BTRFS_WORK_HELPER(scrubwrc_helper);
-BTRFS_WORK_HELPER(scrubnc_helper);
-BTRFS_WORK_HELPER(scrubparity_helper);
-
 static struct __btrfs_workqueue *
 __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
 			 unsigned int flags, int limit_active, int thresh)
@@ -252,16 +217,16 @@ out:
 	}
 }
 
-static void run_ordered_work(struct __btrfs_workqueue *wq)
+static void run_ordered_work(struct __btrfs_workqueue *wq,
+			     struct btrfs_work *self)
 {
 	struct list_head *list = &wq->ordered_list;
 	struct btrfs_work *work;
 	spinlock_t *lock = &wq->list_lock;
 	unsigned long flags;
+	bool free_self = false;
 
 	while (1) {
-		void *wtag;
-
 		spin_lock_irqsave(lock, flags);
 		if (list_empty(list))
 			break;
@@ -287,22 +252,53 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 		list_del(&work->ordered_list);
 		spin_unlock_irqrestore(lock, flags);
 
-		/*
-		 * We don't want to call the ordered free functions with the
-		 * lock held though. Save the work as tag for the trace event,
-		 * because the callback could free the structure.
-		 */
-		wtag = work;
-		work->ordered_free(work);
-		trace_btrfs_all_work_done(wq->fs_info, wtag);
+		if (work == self) {
+			/*
+			 * This is the work item that the worker is currently
+			 * executing.
+			 *
+			 * The kernel workqueue code guarantees non-reentrancy
+			 * of work items. I.e., if a work item with the same
+			 * address and work function is queued twice, the second
+			 * execution is blocked until the first one finishes. A
+			 * work item may be freed and recycled with the same
+			 * work function; the workqueue code assumes that the
+			 * original work item cannot depend on the recycled work
+			 * item in that case (see find_worker_executing_work()).
+			 *
+			 * Note that different types of Btrfs work can depend on
+			 * each other, and one type of work on one Btrfs
+			 * filesystem may even depend on the same type of work
+			 * on another Btrfs filesystem via, e.g., a loop device.
+			 * Therefore, we must not allow the current work item to
+			 * be recycled until we are really done, otherwise we
+			 * break the above assumption and can deadlock.
+			 */
+			free_self = true;
+		} else {
+			/*
+			 * We don't want to call the ordered free functions with
+			 * the lock held.
+			 */
+			work->ordered_free(work);
+			/* NB: work must not be dereferenced past this point. */
+			trace_btrfs_all_work_done(wq->fs_info, work);
+		}
 	}
 	spin_unlock_irqrestore(lock, flags);
+
+	if (free_self) {
+		self->ordered_free(self);
+		/* NB: self must not be dereferenced past this point. */
+		trace_btrfs_all_work_done(wq->fs_info, self);
+	}
 }
 
-static void normal_work_helper(struct btrfs_work *work)
+static void btrfs_work_helper(struct work_struct *normal_work)
 {
+	struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
+					       normal_work);
 	struct __btrfs_workqueue *wq;
-	void *wtag;
 	int need_order = 0;
 
 	/*
@@ -316,29 +312,26 @@ static void normal_work_helper(struct btrfs_work *work)
 	if (work->ordered_func)
 		need_order = 1;
 	wq = work->wq;
-	/* Safe for tracepoints in case work gets freed by the callback */
-	wtag = work;
 
 	trace_btrfs_work_sched(work);
 	thresh_exec_hook(wq);
 	work->func(work);
 	if (need_order) {
 		set_bit(WORK_DONE_BIT, &work->flags);
-		run_ordered_work(wq);
+		run_ordered_work(wq, work);
+	} else {
+		/* NB: work must not be dereferenced past this point. */
+		trace_btrfs_all_work_done(wq->fs_info, work);
 	}
-	if (!need_order)
-		trace_btrfs_all_work_done(wq->fs_info, wtag);
 }
 
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
-		     btrfs_func_t func,
-		     btrfs_func_t ordered_func,
-		     btrfs_func_t ordered_free)
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+		     btrfs_func_t ordered_func, btrfs_func_t ordered_free)
 {
	work->func = func;
 	work->ordered_func = ordered_func;
 	work->ordered_free = ordered_free;
-	INIT_WORK(&work->normal_work, uniq_func);
+	INIT_WORK(&work->normal_work, btrfs_work_helper);
 	INIT_LIST_HEAD(&work->ordered_list);
 	work->flags = 0;
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 7861c9feba5f..a4434301d84d 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -29,49 +29,20 @@ struct btrfs_work {
 	unsigned long flags;
 };
 
-#define BTRFS_WORK_HELPER_PROTO(name)					\
-void btrfs_##name(struct work_struct *arg)
-
-BTRFS_WORK_HELPER_PROTO(worker_helper);
-BTRFS_WORK_HELPER_PROTO(delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(cache_helper);
-BTRFS_WORK_HELPER_PROTO(submit_helper);
-BTRFS_WORK_HELPER_PROTO(fixup_helper);
-BTRFS_WORK_HELPER_PROTO(endio_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
-BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
-BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
-BTRFS_WORK_HELPER_PROTO(rmw_helper);
-BTRFS_WORK_HELPER_PROTO(endio_write_helper);
-BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
-BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
-BTRFS_WORK_HELPER_PROTO(readahead_helper);
-BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
-BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
-BTRFS_WORK_HELPER_PROTO(scrub_helper);
-BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
-
-
 struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 					      const char *name,
 					      unsigned int flags,
 					      int limit_active,
 					      int thresh);
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
-		     btrfs_func_t func,
-		     btrfs_func_t ordered_func,
-		     btrfs_func_t ordered_free);
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+		     btrfs_func_t ordered_func, btrfs_func_t ordered_free);
 void btrfs_queue_work(struct btrfs_workqueue *wq,
 		      struct btrfs_work *work);
 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
 void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
 void btrfs_set_work_high_priority(struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
 bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
 
 #endif
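The async-thread change replaces the per-type helpers generated by BTRFS_WORK_HELPER() with a single btrfs_work_helper() that recovers the btrfs_work via container_of(), so btrfs_init_work() now takes only the three callbacks. A sketch of the resulting queueing pattern, matching the caching_thread hookup shown below (fs_info->caching_workers is the real workqueue field; the wrapper function is hypothetical):

/* Sketch of the new btrfs_init_work() calling convention: no more
 * per-type helper argument, just the work function and its optional
 * ordered companion/free callbacks. */
static void demo_queue_caching_work(struct btrfs_fs_info *fs_info,
				    struct btrfs_caching_control *ctl)
{
	/* ordered_func/ordered_free are NULL: plain, unordered work. */
	btrfs_init_work(&ctl->work, caching_thread, NULL, NULL);
	btrfs_queue_work(fs_info->caching_workers, &ctl->work);
}

One consequence, spelled out in the long run_ordered_work() comment above: with a single work function, the workqueue's non-reentrancy check keys on the item's address, so an ordered work item must not free and recycle itself until the helper is completely done with it, which is why the self-free is deferred via the free_self flag.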
+void btrfs_put_block_group(struct btrfs_block_group *cache) { if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); @@ -149,22 +149,21 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) * This adds the block group to the fs_info rb tree for the block group cache */ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct rb_node **p; struct rb_node *parent = NULL; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; spin_lock(&info->block_group_cache_lock); p = &info->block_group_cache_tree.rb_node; while (*p) { parent = *p; - cache = rb_entry(parent, struct btrfs_block_group_cache, - cache_node); - if (block_group->key.objectid < cache->key.objectid) { + cache = rb_entry(parent, struct btrfs_block_group, cache_node); + if (block_group->start < cache->start) { p = &(*p)->rb_left; - } else if (block_group->key.objectid > cache->key.objectid) { + } else if (block_group->start > cache->start) { p = &(*p)->rb_right; } else { spin_unlock(&info->block_group_cache_lock); @@ -176,8 +175,8 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); - if (info->first_logical_byte > block_group->key.objectid) - info->first_logical_byte = block_group->key.objectid; + if (info->first_logical_byte > block_group->start) + info->first_logical_byte = block_group->start; spin_unlock(&info->block_group_cache_lock); @@ -188,10 +187,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, * This will return the block group at or after bytenr if contains is 0, else * it will return the block group that contains the bytenr */ -static struct btrfs_block_group_cache *block_group_cache_tree_search( +static struct btrfs_block_group *block_group_cache_tree_search( struct btrfs_fs_info *info, u64 bytenr, int contains) { - struct btrfs_block_group_cache *cache, *ret = NULL; + struct btrfs_block_group *cache, *ret = NULL; struct rb_node *n; u64 end, start; @@ -199,13 +198,12 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search( n = info->block_group_cache_tree.rb_node; while (n) { - cache = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - end = cache->key.objectid + cache->key.offset - 1; - start = cache->key.objectid; + cache = rb_entry(n, struct btrfs_block_group, cache_node); + end = cache->start + cache->length - 1; + start = cache->start; if (bytenr < start) { - if (!contains && (!ret || start < ret->key.objectid)) + if (!contains && (!ret || start < ret->start)) ret = cache; n = n->rb_left; } else if (bytenr > start) { @@ -221,8 +219,8 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search( } if (ret) { btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) - info->first_logical_byte = ret->key.objectid; + if (bytenr == 0 && info->first_logical_byte > ret->start) + info->first_logical_byte = ret->start; } spin_unlock(&info->block_group_cache_lock); @@ -232,7 +230,7 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search( /* * Return the block group that starts at or after bytenr */ -struct btrfs_block_group_cache *btrfs_lookup_first_block_group( +struct btrfs_block_group *btrfs_lookup_first_block_group( struct btrfs_fs_info *info, u64 bytenr) { return block_group_cache_tree_search(info, bytenr, 0); @@ -241,14 +239,14 @@ struct btrfs_block_group_cache 
*btrfs_lookup_first_block_group( /* * Return the block group that contains the given bytenr */ -struct btrfs_block_group_cache *btrfs_lookup_block_group( +struct btrfs_block_group *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr) { return block_group_cache_tree_search(info, bytenr, 1); } -struct btrfs_block_group_cache *btrfs_next_block_group( - struct btrfs_block_group_cache *cache) +struct btrfs_block_group *btrfs_next_block_group( + struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; @@ -257,7 +255,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group( /* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { - const u64 next_bytenr = cache->key.objectid + cache->key.offset; + const u64 next_bytenr = cache->start + cache->length; spin_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); @@ -266,8 +264,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group( node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { - cache = rb_entry(node, struct btrfs_block_group_cache, - cache_node); + cache = rb_entry(node, struct btrfs_block_group, cache_node); btrfs_get_block_group(cache); } else cache = NULL; @@ -277,7 +274,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group( bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; bool ret = true; bg = btrfs_lookup_block_group(fs_info, bytenr); @@ -300,7 +297,7 @@ bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, bytenr); ASSERT(bg); @@ -314,7 +311,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) btrfs_put_block_group(bg); } -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +void btrfs_wait_nocow_writers(struct btrfs_block_group *bg) { wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } @@ -322,7 +319,7 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, start); ASSERT(bg); @@ -331,7 +328,7 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, btrfs_put_block_group(bg); } -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) { struct btrfs_space_info *space_info = bg->space_info; @@ -357,7 +354,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) } struct btrfs_caching_control *btrfs_get_caching_control( - struct btrfs_block_group_cache *cache) + struct btrfs_block_group *cache) { struct btrfs_caching_control *ctl; @@ -392,7 +389,7 @@ void btrfs_put_caching_control(struct btrfs_caching_control *ctl) * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using * any of the information in this block group. 
*/ -void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; @@ -401,13 +398,13 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache if (!caching_ctl) return; - wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) || + wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || (cache->free_space_ctl->free_space >= num_bytes)); btrfs_put_caching_control(caching_ctl); } -int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) { struct btrfs_caching_control *caching_ctl; int ret = 0; @@ -416,7 +413,7 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache) if (!caching_ctl) return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; - wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache)); + wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); if (cache->cached == BTRFS_CACHE_ERROR) ret = -EIO; btrfs_put_caching_control(caching_ctl); @@ -424,11 +421,11 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } #ifdef CONFIG_BTRFS_DEBUG -static void fragment_free_space(struct btrfs_block_group_cache *block_group) +static void fragment_free_space(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 start = block_group->key.objectid; - u64 len = block_group->key.offset; + u64 start = block_group->start; + u64 len = block_group->length; u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? fs_info->nodesize : fs_info->sectorsize; u64 step = chunk << 1; @@ -450,8 +447,7 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group) * used yet since their free space will be released as soon as the transaction * commits. 
*/ -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - u64 start, u64 end) +u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end) { struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size, total_added = 0; @@ -491,7 +487,7 @@ u64 add_new_free_space(struct btrfs_block_group_cache *block_group, static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { - struct btrfs_block_group_cache *block_group = caching_ctl->block_group; + struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_path *path; @@ -507,7 +503,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) if (!path) return -ENOMEM; - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); #ifdef CONFIG_BTRFS_DEBUG /* @@ -587,13 +583,12 @@ next: goto next; } - if (key.objectid < block_group->key.objectid) { + if (key.objectid < block_group->start) { path->slots[0]++; continue; } - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) + if (key.objectid >= block_group->start + block_group->length) break; if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -617,8 +612,7 @@ next: ret = 0; total_found += add_new_free_space(block_group, last, - block_group->key.objectid + - block_group->key.offset); + block_group->start + block_group->length); caching_ctl->progress = (u64)-1; out: @@ -628,7 +622,7 @@ out: static noinline void caching_thread(struct btrfs_work *work) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_caching_control *caching_ctl; int ret; @@ -656,8 +650,7 @@ static noinline void caching_thread(struct btrfs_work *work) spin_lock(&block_group->space_info->lock); spin_lock(&block_group->lock); - bytes_used = block_group->key.offset - - btrfs_block_group_used(&block_group->item); + bytes_used = block_group->length - block_group->used; block_group->space_info->bytes_used += bytes_used >> 1; spin_unlock(&block_group->lock); spin_unlock(&block_group->space_info->lock); @@ -677,8 +670,7 @@ static noinline void caching_thread(struct btrfs_work *work) btrfs_put_block_group(block_group); } -int btrfs_cache_block_group(struct btrfs_block_group_cache *cache, - int load_cache_only) +int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only) { DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; @@ -693,10 +685,9 @@ int btrfs_cache_block_group(struct btrfs_block_group_cache *cache, mutex_init(&caching_ctl->mutex); init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; + caching_ctl->progress = cache->start; refcount_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, - caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -763,8 +754,7 @@ int btrfs_cache_block_group(struct btrfs_block_group_cache *cache, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - bytes_used = cache->key.offset - - btrfs_block_group_used(&cache->item); + bytes_used = cache->length - cache->used; cache->space_info->bytes_used += bytes_used >> 1; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -833,27 
+823,36 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group * in the whole filesystem + * + * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups */ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { + bool found_raid56 = false; + bool found_raid1c34 = false; + + if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) || + (flags & BTRFS_BLOCK_GROUP_RAID1C3) || + (flags & BTRFS_BLOCK_GROUP_RAID1C4)) { struct list_head *head = &fs_info->space_info; struct btrfs_space_info *sinfo; list_for_each_entry_rcu(sinfo, head, list) { - bool found = false; - down_read(&sinfo->groups_sem); if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) - found = true; + found_raid56 = true; if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) - found = true; + found_raid56 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3])) + found_raid1c34 = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4])) + found_raid1c34 = true; up_read(&sinfo->groups_sem); - - if (found) - return; } - btrfs_clear_fs_incompat(fs_info, RAID56); + if (!found_raid56) + btrfs_clear_fs_incompat(fs_info, RAID56); + if (!found_raid1c34) + btrfs_clear_fs_incompat(fs_info, RAID1C34); } } @@ -863,7 +862,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_path *path; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_key key; @@ -886,10 +885,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * remove it.
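The helper now scans every space_info before deciding anything, instead of returning early on the first hit, so a single pass can retire both the RAID56 and the RAID1C34 incompat bits; a bit is cleared only when the scan found no block group that still needs it. The scan-then-clear shape in plain C (names and types are illustrative, not the kernel API):

#include <stdbool.h>

/* Illustrative only: scan all candidates, then clear the feature bits
 * for which no user was found.  In the kernel the "users" are block
 * group lists and the bits are superblock incompat flags. */
static void clear_unused_feature_bits(const bool *uses_a, const bool *uses_b,
                                      int n, bool *feature_a, bool *feature_b)
{
        bool found_a = false, found_b = false;
        int i;

        for (i = 0; i < n; i++) {   /* never return early: both answers  */
                if (uses_a[i])      /* must be complete before clearing  */
                        found_a = true;
                if (uses_b[i])
                        found_b = true;
        }
        if (!found_a)
                *feature_a = false;
        if (!found_b)
                *feature_b = false;
}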
*/ btrfs_free_excluded_extents(block_group); - btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, - block_group->key.offset); + btrfs_free_ref_tree_range(fs_info, block_group->start, + block_group->length); - memcpy(&key, &block_group->key, sizeof(key)); index = btrfs_bg_flags_to_raid_index(block_group->flags); factor = btrfs_bg_type_to_factor(block_group->flags); @@ -967,8 +965,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = block_group->key.objectid; key.type = 0; + key.offset = block_group->start; ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) @@ -987,7 +985,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); - if (fs_info->first_logical_byte == block_group->key.objectid) + if (fs_info->first_logical_byte == block_group->start) fs_info->first_logical_byte = (u64)-1; spin_unlock(&fs_info->block_group_cache_lock); @@ -1048,19 +1046,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes - < block_group->key.offset); + < block_group->length); WARN_ON(block_group->space_info->bytes_readonly - < block_group->key.offset); + < block_group->length); WARN_ON(block_group->space_info->disk_total - < block_group->key.offset * factor); + < block_group->length * factor); } - block_group->space_info->total_bytes -= block_group->key.offset; - block_group->space_info->bytes_readonly -= block_group->key.offset; - block_group->space_info->disk_total -= block_group->key.offset * factor; + block_group->space_info->total_bytes -= block_group->length; + block_group->space_info->bytes_readonly -= block_group->length; + block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); - memcpy(&key, &block_group->key, sizeof(key)); + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); @@ -1180,7 +1180,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( * data in this block group. That check should be done by relocation routine, * not this function. 
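With the cached block_group->key copy gone, btrfs_remove_block_group() rebuilds its search keys by hand: a block group item is addressed as (objectid = start of the range, type = BTRFS_BLOCK_GROUP_ITEM_KEY, offset = length), while the group's free space cache inode is keyed as (FREE_SPACE_OBJECTID, 0, start). A sketch of both constructions; the constants stand in for the kernel's:

typedef unsigned long long u64;

struct key { u64 objectid; unsigned char type; u64 offset; };

#define BLOCK_GROUP_ITEM_KEY 192        /* kernel: BTRFS_BLOCK_GROUP_ITEM_KEY */
#define FREE_SPACE_OBJECTID  ((u64)-11) /* kernel: BTRFS_FREE_SPACE_OBJECTID  */

static struct key block_group_key(u64 start, u64 length)
{
        /* A block group item is addressed by its byte range. */
        struct key k = { .objectid = start,
                         .type = BLOCK_GROUP_ITEM_KEY,
                         .offset = length };
        return k;
}

static struct key space_cache_key(u64 start)
{
        /* The free space cache inode for a group is keyed by its start. */
        struct key k = { .objectid = FREE_SPACE_OBJECTID,
                         .type = 0,
                         .offset = start };
        return k;
}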
*/ -static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; @@ -1209,8 +1209,8 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) goto out; } - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); + num_bytes = cache->length - cache->reserved - cache->pinned - + cache->bytes_super - cache->used; sinfo_used = btrfs_space_info_used(sinfo, true); /* @@ -1231,8 +1231,7 @@ out: spin_unlock(&sinfo->lock); if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, - "unable to make block group %llu ro", - cache->key.objectid); + "unable to make block group %llu ro", cache->start); btrfs_info(cache->fs_info, "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", sinfo_used, num_bytes, min_allocable_bytes); @@ -1247,7 +1246,7 @@ out: */ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; int ret = 0; @@ -1261,7 +1260,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) int trimming; block_group = list_first_entry(&fs_info->unused_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, bg_list); list_del_init(&block_group->bg_list); @@ -1279,8 +1278,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) down_write(&space_info->groups_sem); spin_lock(&block_group->lock); if (block_group->reserved || block_group->pinned || - btrfs_block_group_used(&block_group->item) || - block_group->ro || + block_group->used || block_group->ro || list_is_singular(&block_group->list)) { /* * We want to bail if we made new allocations or have @@ -1308,7 +1306,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * properly if we fail to join the transaction. */ trans = btrfs_start_trans_remove_block_group(fs_info, - block_group->key.objectid); + block_group->start); if (IS_ERR(trans)) { btrfs_dec_block_group_ro(block_group); ret = PTR_ERR(trans); @@ -1319,8 +1317,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. */ - start = block_group->key.objectid; - end = start + block_group->key.offset - 1; + start = block_group->start; + end = start + block_group->length - 1; /* * Hold the unused_bg_unpin_mutex lock to avoid racing with * btrfs_finish_extent_commit(). If we are at transaction N, @@ -1375,7 +1373,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. 
*/ - ret = btrfs_remove_chunk(trans, block_group->key.objectid); + ret = btrfs_remove_chunk(trans, block_group->start); if (ret) { if (trimming) @@ -1410,7 +1408,7 @@ next: spin_unlock(&fs_info->unused_bgs_lock); } -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg) +void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1478,7 +1476,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), sizeof(bg)); - flags = btrfs_block_group_flags(&bg) & + flags = btrfs_stack_block_group_flags(&bg) & BTRFS_BLOCK_GROUP_TYPE_MASK; if (flags != (em->map_lookup->type & @@ -1518,7 +1516,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } -static int exclude_super_stripes(struct btrfs_block_group_cache *cache) +static int exclude_super_stripes(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; u64 bytenr; @@ -1526,10 +1524,10 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache) int stripe_len; int i, nr, ret; - if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { - stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + if (cache->start < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid, + ret = btrfs_add_excluded_extent(fs_info, cache->start, stripe_len); if (ret) return ret; @@ -1537,7 +1535,7 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->key.objectid, + ret = btrfs_rmap_block(fs_info, cache->start, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -1545,21 +1543,19 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache) while (nr--) { u64 start, len; - if (logical[nr] > cache->key.objectid + - cache->key.offset) + if (logical[nr] > cache->start + cache->length) continue; - if (logical[nr] + stripe_len <= cache->key.objectid) + if (logical[nr] + stripe_len <= cache->start) continue; start = logical[nr]; - if (start < cache->key.objectid) { - start = cache->key.objectid; + if (start < cache->start) { + start = cache->start; len = (logical[nr] + stripe_len) - start; } else { len = min_t(u64, stripe_len, - cache->key.objectid + - cache->key.offset - start); + cache->start + cache->length - start); } cache->bytes_super += len; @@ -1575,7 +1571,7 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache) return 0; } -static void link_block_group(struct btrfs_block_group_cache *cache) +static void link_block_group(struct btrfs_block_group *cache) { struct btrfs_space_info *space_info = cache->space_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); @@ -1591,10 +1587,10 @@ static void link_block_group(struct btrfs_block_group_cache *cache) btrfs_sysfs_add_block_group_type(cache); } -static struct btrfs_block_group_cache *btrfs_create_block_group_cache( +static struct btrfs_block_group *btrfs_create_block_group_cache( struct btrfs_fs_info *fs_info, u64 start, u64 size) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) @@ -1607,9 +1603,8 @@ static struct btrfs_block_group_cache *btrfs_create_block_group_cache( return NULL; } - 
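exclude_super_stripes() keeps the same interval arithmetic under the new names: each superblock mirror stripe is intersected with [start, start + length) and the overlap is charged to bytes_super. The per-stripe clamping modelled in isolation, with the same branch structure as the kernel hunk (including the unclamped start < bg_start case, which relies on super stripes being small relative to a group):

typedef unsigned long long u64;

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Sketch of the overlap computed per superblock mirror stripe.
 * Returns 0 when the stripe misses the group entirely. */
static u64 super_stripe_overlap(u64 bg_start, u64 bg_length,
                                u64 stripe_start, u64 stripe_len)
{
        u64 bg_end = bg_start + bg_length;

        if (stripe_start > bg_end)                 /* starts past the group */
                return 0;
        if (stripe_start + stripe_len <= bg_start) /* ends before the group */
                return 0;
        if (stripe_start < bg_start)
                return (stripe_start + stripe_len) - bg_start;
        return MIN(stripe_len, bg_end - stripe_start);
}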
cache->key.objectid = start; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->start = start; + cache->length = size; cache->fs_info = fs_info; cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); @@ -1640,7 +1635,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; u64 start = 0; int ret = 0; @@ -1665,15 +1660,14 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) free_extent_map(em); break; } - if (bg->key.objectid != em->start || - bg->key.offset != em->len || + if (bg->start != em->start || bg->length != em->len || (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(fs_info, "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx", em->start, em->len, em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK, - bg->key.objectid, bg->key.offset, + bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; free_extent_map(em); @@ -1687,22 +1681,117 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } +static int read_one_block_group(struct btrfs_fs_info *info, + struct btrfs_path *path, + const struct btrfs_key *key, + int need_clear) +{ + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_block_group *cache; + struct btrfs_space_info *space_info; + struct btrfs_block_group_item bgi; + const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); + int slot = path->slots[0]; + int ret; + + ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); + + cache = btrfs_create_block_group_cache(info, key->objectid, key->offset); + if (!cache) + return -ENOMEM; + + if (need_clear) { + /* + * When we mount with old space cache, we need to + * set BTRFS_DC_CLEAR and set dirty flag. + * + * a) Setting 'BTRFS_DC_CLEAR' makes sure that we + * truncate the old free space cache inode and + * setup a new one. + * b) Setting 'dirty flag' makes sure that we flush + * the new space cache info onto disk. + */ + if (btrfs_test_opt(info, SPACE_CACHE)) + cache->disk_cache_state = BTRFS_DC_CLEAR; + } + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + cache->used = btrfs_stack_block_group_used(&bgi); + cache->flags = btrfs_stack_block_group_flags(&bgi); + if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(info, +"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", + cache->start); + ret = -EINVAL; + goto error; + } + + /* + * We need to exclude the super stripes now so that the space info has + * super bytes accounted for, otherwise we'll think we have more space + * than we actually do. + */ + ret = exclude_super_stripes(cache); + if (ret) { + /* We may have excluded something, so call this just in case. */ + btrfs_free_excluded_extents(cache); + goto error; + } + + /* + * Check for two cases, either we are full, and therefore don't need + * to bother with the caching work since we won't find any space, or we + * are empty, and we can just add all the space in and be done with it. + * This saves us _a_lot_ of time, particularly in the full case. 
+ */ + if (key->offset == cache->used) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + btrfs_free_excluded_extents(cache); + } else if (cache->used == 0) { + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, key->objectid, + key->objectid + key->offset); + btrfs_free_excluded_extents(cache); + } + + ret = btrfs_add_block_group_cache(info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + goto error; + } + trace_btrfs_add_block_group(info, cache, 0); + btrfs_update_space_info(info, cache->flags, key->offset, + cache->used, cache->bytes_super, &space_info); + + cache->space_info = space_info; + + link_block_group(cache); + + set_avail_alloc_bits(info, cache->flags); + if (btrfs_chunk_readonly(info, cache->start)) { + inc_block_group_ro(cache, 1); + } else if (cache->used == 0) { + ASSERT(list_empty(&cache->bg_list)); + btrfs_mark_bg_unused(cache); + } + return 0; +error: + btrfs_put_block_group(cache); + return ret; +} + int btrfs_read_block_groups(struct btrfs_fs_info *info) { struct btrfs_path *path; int ret; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; struct btrfs_space_info *space_info; struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *leaf; int need_clear = 0; u64 cache_gen; - u64 feature; - int mixed; - - feature = btrfs_super_incompat_flags(info->super_copy); - mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); key.objectid = 0; key.offset = 0; @@ -1726,107 +1815,13 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) if (ret != 0) goto error; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - cache = btrfs_create_block_group_cache(info, found_key.objectid, - found_key.offset); - if (!cache) { - ret = -ENOMEM; - goto error; - } - - if (need_clear) { - /* - * When we mount with old space cache, we need to - * set BTRFS_DC_CLEAR and set dirty flag. - * - * a) Setting 'BTRFS_DC_CLEAR' makes sure that we - * truncate the old free space cache inode and - * setup a new one. - * b) Setting 'dirty flag' makes sure that we flush - * the new space cache info onto disk. - */ - if (btrfs_test_opt(info, SPACE_CACHE)) - cache->disk_cache_state = BTRFS_DC_CLEAR; - } - - read_extent_buffer(leaf, &cache->item, - btrfs_item_ptr_offset(leaf, path->slots[0]), - sizeof(cache->item)); - cache->flags = btrfs_block_group_flags(&cache->item); - if (!mixed && - ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && - (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(info, -"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", - cache->key.objectid); - ret = -EINVAL; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = read_one_block_group(info, path, &key, need_clear); + if (ret < 0) goto error; - } - - key.objectid = found_key.objectid + found_key.offset; + key.objectid += key.offset; + key.offset = 0; btrfs_release_path(path); - - /* - * We need to exclude the super stripes now so that the space - * info has super bytes accounted for, otherwise we'll think - * we have more space than we actually do. - */ - ret = exclude_super_stripes(cache); - if (ret) { - /* - * We may have excluded something, so call this just in - * case. 
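read_one_block_group() preserves the old loop body's two shortcuts: a group whose used bytes equal its length is completely full and needs no caching pass, and a fully empty group can publish its whole range as free space immediately; both are marked BTRFS_CACHE_FINISHED up front. The decision reduced to a model (types and the add_free callback are stand-ins, not the kernel API):

typedef unsigned long long u64;
enum cache_state { CACHE_NO, CACHE_FINISHED };

struct bg { u64 start, length, used; enum cache_state cached; };

/* Model of the two fast paths in read_one_block_group(): a completely
 * full group needs no caching pass at all, and a completely empty one
 * can publish its whole range as free space immediately. */
static void maybe_skip_caching(struct bg *bg,
                               void (*add_free)(struct bg *, u64, u64))
{
        if (bg->used == bg->length) {           /* full: nothing to find */
                bg->cached = CACHE_FINISHED;
        } else if (bg->used == 0) {             /* empty: all of it free */
                bg->cached = CACHE_FINISHED;
                add_free(bg, bg->start, bg->start + bg->length);
        }
}

The caller in btrfs_read_block_groups() then advances its cursor with key.objectid += key.offset and a zeroed offset, which lands the next search just past the range the consumed item described.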
- */ - btrfs_free_excluded_extents(cache); - btrfs_put_block_group(cache); - goto error; - } - - /* - * Check for two cases, either we are full, and therefore - * don't need to bother with the caching work since we won't - * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _a_lot_ of - * time, particularly in the full case. - */ - if (found_key.offset == btrfs_block_group_used(&cache->item)) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - btrfs_free_excluded_extents(cache); - } else if (btrfs_block_group_used(&cache->item) == 0) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, found_key.objectid, - found_key.objectid + - found_key.offset); - btrfs_free_excluded_extents(cache); - } - - ret = btrfs_add_block_group_cache(info, cache); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - goto error; - } - - trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); - - cache->space_info = space_info; - - link_block_group(cache); - - set_avail_alloc_bits(info, cache->flags); - if (btrfs_chunk_readonly(info, cache->key.objectid)) { - inc_block_group_ro(cache, 1); - } else if (btrfs_block_group_used(&cache->item) == 0) { - ASSERT(list_empty(&cache->bg_list)); - btrfs_mark_bg_unused(cache); - } } list_for_each_entry_rcu(space_info, &info->space_info, list) { @@ -1860,7 +1855,7 @@ error: void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; struct btrfs_key key; @@ -1871,14 +1866,19 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) while (!list_empty(&trans->new_bgs)) { block_group = list_first_entry(&trans->new_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, bg_list); if (ret) goto next; spin_lock(&block_group->lock); - memcpy(&item, &block_group->item, sizeof(item)); - memcpy(&key, &block_group->key, sizeof(key)); + btrfs_set_stack_block_group_used(&item, block_group->used); + btrfs_set_stack_block_group_chunk_objectid(&item, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_stack_block_group_flags(&item, block_group->flags); + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; spin_unlock(&block_group->lock); ret = btrfs_insert_item(trans, extent_root, &key, &item, @@ -1901,7 +1901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; int ret; btrfs_set_log_full_commit(trans); @@ -1910,11 +1910,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, if (!cache) return -ENOMEM; - btrfs_set_block_group_used(&cache->item, bytes_used); - btrfs_set_block_group_chunk_objectid(&cache->item, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_block_group_flags(&cache->item, type); - + cache->used = bytes_used; cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; @@ -2021,8 +2017,17 @@ static u64 
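Since struct btrfs_block_group no longer embeds a btrfs_block_group_item, btrfs_create_pending_block_groups() serializes one on demand with the set_stack helpers (used, chunk_objectid, flags) just before inserting the on-disk item, and write_one_cache_group() later does the same for updates. A trimmed model of that write-time serialization (the constant is a stand-in for the kernel's):

typedef unsigned long long u64;

/* Mirrors the three fields of struct btrfs_block_group_item. */
struct bg_item { u64 used; u64 chunk_objectid; u64 flags; };

struct bg { u64 start, length, used, flags; };

#define FIRST_CHUNK_TREE_OBJECTID 256 /* stand-in for the kernel constant */

/* Model: build the on-disk item from in-memory state at write time,
 * instead of keeping a second, always-in-sync copy in memory. */
static struct bg_item bg_to_item(const struct bg *bg)
{
        struct bg_item item = {
                .used = bg->used,
                .chunk_objectid = FIRST_CHUNK_TREE_OBJECTID,
                .flags = bg->flags,
        };
        return item;
}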
update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) return flags; } -int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache) - +/* + * Mark one block group RO; can be called several times for the same block + * group. + * + * @cache: the destination block group + * @do_chunk_alloc: whether we need to do chunk pre-allocation; this is to + * ensure we still have some free space after marking this + * block group RO. + */ +int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_trans_handle *trans; @@ -2052,25 +2057,29 @@ again: goto again; } - /* - * if we are changing raid levels, try to allocate a corresponding - * block group with the new raid level. - */ - alloc_flags = update_block_group_flags(fs_info, cache->flags); - if (alloc_flags != cache->flags) { - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + if (do_chunk_alloc) { /* - * ENOSPC is allowed here, we may have enough space - * already allocated at the new raid level to - * carry on + * If we are changing raid levels, try to allocate a + * corresponding block group with the new raid level. */ - if (ret == -ENOSPC) - ret = 0; - if (ret < 0) - goto out; + alloc_flags = update_block_group_flags(fs_info, cache->flags); + if (alloc_flags != cache->flags) { + ret = btrfs_chunk_alloc(trans, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } } - ret = inc_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, !do_chunk_alloc); + if (!do_chunk_alloc) + goto unlock_out; if (!ret) goto out; alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); @@ -2085,13 +2094,14 @@ out: check_system_chunk(trans, alloc_flags); mutex_unlock(&fs_info->chunk_mutex); } +unlock_out: mutex_unlock(&fs_info->ro_block_group_mutex); btrfs_end_transaction(trans); return ret; } -void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) +void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; @@ -2101,9 +2111,8 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) spin_lock(&sinfo->lock); spin_lock(&cache->lock); if (!--cache->ro) { - num_bytes = cache->key.offset - cache->reserved - - cache->pinned - cache->bytes_super - - btrfs_block_group_used(&cache->item); + num_bytes = cache->length - cache->reserved - + cache->pinned - cache->bytes_super - cache->used; sinfo->bytes_readonly -= num_bytes; list_del_init(&cache->ro_list); } @@ -2113,15 +2122,21 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) static int write_one_cache_group(struct btrfs_trans_handle *trans, struct btrfs_path *path, - struct btrfs_block_group_cache *cache) + struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *extent_root = fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; + struct btrfs_block_group_item bgi; + struct btrfs_key key; + + key.objectid = cache->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = cache->length; - ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -2130,7 +2145,11 @@ static int write_one_cache_group(struct
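Callers of btrfs_inc_block_group_ro() now choose via do_chunk_alloc whether to pre-allocate a replacement chunk: balance keeps passing true, while a caller that only needs the group temporarily read-only passes false, which both forces the transition (inc_block_group_ro(cache, !do_chunk_alloc) skips the free-space check) and takes the new unlock_out path with no allocate-and-retry fallback. The control flow as a standalone model (prealloc_chunk and try_ro are stand-ins for the kernel steps, and the retry loop is elided):

#include <errno.h>
#include <stdbool.h>

static int mark_bg_ro(bool do_chunk_alloc,
                      int (*prealloc_chunk)(void),
                      int (*try_ro)(bool force))
{
        int ret;

        if (do_chunk_alloc) {
                /* pre-allocate at the target profile; -ENOSPC is
                 * tolerated, there may already be room at that level */
                ret = prealloc_chunk();
                if (ret < 0 && ret != -ENOSPC)
                        return ret;
        }
        /* no pre-allocation means: force the RO flip and skip the
         * free-space check entirely */
        ret = try_ro(!do_chunk_alloc);
        if (!do_chunk_alloc)
                return ret;     /* the new unlock_out path: no retry */
        /* with do_chunk_alloc, a failed flip would allocate a fresh
         * chunk and try once more (elided) */
        return ret;
}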
btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_chunk_objectid(&bgi, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_stack_block_group_flags(&bgi, cache->flags); + write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); @@ -2138,7 +2157,7 @@ fail: } -static int cache_save_setup(struct btrfs_block_group_cache *block_group, +static int cache_save_setup(struct btrfs_block_group *block_group, struct btrfs_trans_handle *trans, struct btrfs_path *path) { @@ -2156,7 +2175,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, * If this block group is smaller than 100 megs don't bother caching the * block group. */ - if (block_group->key.offset < (100 * SZ_1M)) { + if (block_group->length < (100 * SZ_1M)) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -2257,7 +2276,7 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = div_u64(block_group->key.offset, SZ_256M); + num_pages = div_u64(block_group->length, SZ_256M); if (!num_pages) num_pages = 1; @@ -2302,7 +2321,7 @@ out: int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_path *path; @@ -2340,7 +2359,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; @@ -2377,8 +2396,7 @@ again: while (!list_empty(&dirty)) { bool drop_reserve = true; - cache = list_first_entry(&dirty, - struct btrfs_block_group_cache, + cache = list_first_entry(&dirty, struct btrfs_block_group, dirty_list); /* * This can happen if something re-dirties a block group that @@ -2503,7 +2521,7 @@ again: int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; @@ -2533,7 +2551,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, dirty_list); /* @@ -2615,7 +2633,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * to use it without any locking */ while (!list_empty(io)) { - cache = list_first_entry(io, struct btrfs_block_group_cache, + cache = list_first_entry(io, struct btrfs_block_group, io_list); list_del_init(&cache->io_list); btrfs_wait_cache_io(trans, cache, path); @@ -2630,7 +2648,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; 
u64 total = num_bytes; u64 old_val; u64 byte_in_group; @@ -2661,11 +2679,11 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * is because we need the unpinning stage to actually add the * space back to the block group, otherwise we will leak space. */ - if (!alloc && cache->cached == BTRFS_CACHE_NO) + if (!alloc && !btrfs_block_group_done(cache)) btrfs_cache_block_group(cache, 1); - byte_in_group = bytenr - cache->key.objectid; - WARN_ON(byte_in_group > cache->key.offset); + byte_in_group = bytenr - cache->start; + WARN_ON(byte_in_group > cache->length); spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); @@ -2674,11 +2692,11 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - old_val = btrfs_block_group_used(&cache->item); - num_bytes = min(total, cache->key.offset - byte_in_group); + old_val = cache->used; + num_bytes = min(total, cache->length - byte_in_group); if (alloc) { old_val += num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); + cache->used = old_val; cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; cache->space_info->bytes_used += num_bytes; @@ -2687,7 +2705,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); + cache->used = old_val; cache->pinned += num_bytes; btrfs_space_info_update_bytes_pinned(info, cache->space_info, num_bytes); @@ -2745,7 +2763,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and the block group has become read only we cannot make the * reservation and return -EAGAIN, otherwise this function always succeeds. */ -int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, +int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; @@ -2781,7 +2799,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, * A and before transaction A commits you free that leaf, you call this with * reserve set to 0 in order to clear the reservation. */ -void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; @@ -2987,9 +3005,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) } /* - * If @is_allocation is true, reserve space in the system space info necessary - * for allocating a chunk, otherwise if it's false, reserve space necessary for - * removing a chunk. 
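The btrfs_update_block_group() hunks replace btrfs_block_group_used()/btrfs_set_block_group_used() on the embedded item with direct reads and writes of cache->used; the accounting itself is untouched: an allocation moves bytes from reserved to used, and a free moves them from used to pinned until the transaction commits and unpins them. Reduced to the counter moves (a model, without the space_info mirroring and locking the kernel does):

typedef unsigned long long u64;

struct bg_counters { u64 used, reserved, pinned; };

/* Model of the two directions in btrfs_update_block_group(). */
static void account_extent(struct bg_counters *c, u64 bytes, int alloc)
{
        if (alloc) {
                c->used += bytes;       /* reserved space becomes used   */
                c->reserved -= bytes;
        } else {
                c->used -= bytes;       /* freed space pins until commit */
                c->pinned += bytes;
        }
}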
+ * Reserve space in the system space for allocating or removing a chunk */ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) { @@ -3046,7 +3062,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; u64 last = 0; while (1) { @@ -3074,7 +3090,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) spin_unlock(&block_group->lock); ASSERT(block_group->io_ctl.inode == NULL); iput(inode); - last = block_group->key.objectid + block_group->key.offset; + last = block_group->start + block_group->length; btrfs_put_block_group(block_group); } } @@ -3086,7 +3102,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) */ int btrfs_free_block_groups(struct btrfs_fs_info *info) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_caching_control *caching_ctl; struct rb_node *n; @@ -3103,7 +3119,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->unused_bgs)) { block_group = list_first_entry(&info->unused_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, bg_list); list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); @@ -3112,7 +3128,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { - block_group = rb_entry(n, struct btrfs_block_group_cache, + block_group = rb_entry(n, struct btrfs_block_group, cache_node); rb_erase(&block_group->cache_node, &info->block_group_cache_tree); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c391800388dd..9b409676c4b2 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -34,7 +34,7 @@ struct btrfs_caching_control { struct mutex mutex; wait_queue_head_t wait; struct btrfs_work work; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; u64 progress; refcount_t count; }; @@ -42,14 +42,15 @@ struct btrfs_caching_control { /* Once caching_thread() finds this much free space, it will wake up waiters. 
*/ #define CACHING_CTL_WAKE_UP SZ_2M -struct btrfs_block_group_cache { - struct btrfs_key key; - struct btrfs_block_group_item item; +struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct inode *inode; spinlock_t lock; + u64 start; + u64 length; u64 pinned; u64 reserved; + u64 used; u64 delalloc_bytes; u64 bytes_super; u64 flags; @@ -159,7 +160,7 @@ struct btrfs_block_group_cache { #ifdef CONFIG_BTRFS_DEBUG static inline int btrfs_should_fragment_free_space( - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -170,29 +171,29 @@ static inline int btrfs_should_fragment_free_space( } #endif -struct btrfs_block_group_cache *btrfs_lookup_first_block_group( +struct btrfs_block_group *btrfs_lookup_first_block_group( struct btrfs_fs_info *info, u64 bytenr); -struct btrfs_block_group_cache *btrfs_lookup_block_group( +struct btrfs_block_group *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); -struct btrfs_block_group_cache *btrfs_next_block_group( - struct btrfs_block_group_cache *cache); -void btrfs_get_block_group(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); +struct btrfs_block_group *btrfs_next_block_group( + struct btrfs_block_group *cache); +void btrfs_get_block_group(struct btrfs_block_group *cache); +void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); -void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, +void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); -int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache); -int btrfs_cache_block_group(struct btrfs_block_group_cache *cache, +int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache); +int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( - struct btrfs_block_group_cache *cache); -u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *cache); +u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, @@ -200,21 +201,22 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); -void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); +void btrfs_mark_bg_unused(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); -int 
btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); -void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); +int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + bool do_chunk_alloc); +void btrfs_dec_block_group_ro(struct btrfs_block_group *cache); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc); -int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, +int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc); -void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, int delalloc); int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force); @@ -239,8 +241,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static inline int btrfs_block_group_cache_done( - struct btrfs_block_group_cache *cache) +static inline int btrfs_block_group_done(struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index f853835c409c..4e12a477d32e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -63,9 +63,6 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; - /* held while doing delalloc reservations */ - struct mutex delalloc_mutex; - /* used to order data wrt metadata */ struct btrfs_ordered_inode_tree ordered_tree; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b05b361e2062..ee834ef7beb4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -29,6 +29,41 @@ #include "extent_io.h" #include "extent_map.h" +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page 
*dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) @@ -39,6 +74,8 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) case BTRFS_COMPRESS_ZSTD: case BTRFS_COMPRESS_NONE: return btrfs_compress_types[type]; + default: + break; } return NULL; @@ -60,6 +97,70 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) return false; } +static int compression_compress_pages(int type, struct list_head *ws, + struct address_space *mapping, u64 start, struct page **pages, + unsigned long *out_pages, unsigned long *total_in, + unsigned long *total_out) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: + return zlib_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_LZO: + return lzo_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_ZSTD: + return zstd_compress_pages(ws, mapping, start, pages, + out_pages, total_in, total_out); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. As a sane fallback, return what the + * callers will understand as 'no compression happened'. + */ + return -E2BIG; + } +} + +static int compression_decompress_bio(int type, struct list_head *ws, + struct compressed_bio *cb) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); + case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); + case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} + +static int compression_decompress(int type, struct list_head *ws, + unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + switch (type) { + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + start_byte, srclen, destlen); + case BTRFS_COMPRESS_NONE: + default: + /* + * This can't happen, the type is validated several times + * before we get here. 
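The compression_compress_pages/compression_decompress_bio/compression_decompress wrappers above replace the per-algorithm function pointers that used to hang off struct btrfs_compress_op with explicit switches on the compression type, turning indirect calls into direct ones; the BTRFS_COMPRESS_NONE and default arms are unreachable because the type is validated well before these helpers run. The dispatch style as a standalone model (the *_go() functions are stand-ins for the real entry points):

enum ctype { NONE, ZLIB, LZO, ZSTD };

static int zlib_go(void) { return 1; }  /* stand-ins for the real     */
static int lzo_go(void)  { return 2; }  /* per-algorithm entry points */
static int zstd_go(void) { return 3; }

/* Model of the dispatch style adopted above: one switch per operation,
 * with the impossible arms failing loudly instead of jumping through
 * a function-pointer table. */
static int dispatch(enum ctype type)
{
        switch (type) {
        case ZLIB: return zlib_go();
        case LZO:  return lzo_go();
        case ZSTD: return zstd_go();
        case NONE:
        default:
                return -1;      /* the kernel BUG()s here: validated earlier */
        }
}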
+ */ + BUG(); + } +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, @@ -311,7 +412,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages, - unsigned int write_flags) + unsigned int write_flags, + struct cgroup_subsys_state *blkcg_css) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; @@ -320,7 +422,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, int pg_index = 0; struct page *page; u64 first_byte = disk_start; - struct block_device *bdev; blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -339,13 +440,15 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; - bdev = fs_info->fs_devices->latest_bdev; - bio = btrfs_bio_alloc(first_byte); - bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + + if (blkcg_css) { + bio->bi_opf |= REQ_CGROUP_PUNT; + bio_associate_blkg_from_css(bio, blkcg_css); + } refcount_set(&cb->pending_bios, 1); /* create and submit bios for the compressed pages */ @@ -378,14 +481,13 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(fs_info, bio, 0, 1); + ret = btrfs_map_bio(fs_info, bio, 0); if (ret) { bio->bi_status = ret; bio_endio(bio); } bio = btrfs_bio_alloc(first_byte); - bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -409,7 +511,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(fs_info, bio, 0, 1); + ret = btrfs_map_bio(fs_info, bio, 0); if (ret) { bio->bi_status = ret; bio_endio(bio); @@ -553,7 +655,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned long nr_pages; unsigned long pg_index; struct page *page; - struct block_device *bdev; struct bio *comp_bio; u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; u64 em_len; @@ -604,8 +705,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb->compressed_pages) goto fail1; - bdev = fs_info->fs_devices->latest_bdev; - for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); @@ -624,7 +723,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = bio->bi_iter.bi_size; comp_bio = btrfs_bio_alloc(cur_disk_byte); - bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -668,14 +766,13 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, fs_info->sectorsize); sums += csum_size * nr_sectors; - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { comp_bio->bi_status = ret; bio_endio(comp_bio); } comp_bio = btrfs_bio_alloc(cur_disk_byte); - bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -693,7 +790,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM 
*/ } - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { comp_bio->bi_status = ret; bio_endio(comp_bio); @@ -764,26 +861,6 @@ struct heuristic_ws { static struct workspace_manager heuristic_wsm; -static void heuristic_init_workspace_manager(void) -{ - btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress); -} - -static void heuristic_cleanup_workspace_manager(void) -{ - btrfs_cleanup_workspace_manager(&heuristic_wsm); -} - -static struct list_head *heuristic_get_workspace(unsigned int level) -{ - return btrfs_get_workspace(&heuristic_wsm, level); -} - -static void heuristic_put_workspace(struct list_head *ws) -{ - btrfs_put_workspace(&heuristic_wsm, ws); -} - static void free_heuristic_ws(struct list_head *ws) { struct heuristic_ws *workspace; @@ -824,12 +901,7 @@ fail: } const struct btrfs_compress_op btrfs_heuristic_compress = { - .init_workspace_manager = heuristic_init_workspace_manager, - .cleanup_workspace_manager = heuristic_cleanup_workspace_manager, - .get_workspace = heuristic_get_workspace, - .put_workspace = heuristic_put_workspace, - .alloc_workspace = alloc_heuristic_ws, - .free_workspace = free_heuristic_ws, + .workspace_manager = &heuristic_wsm, }; static const struct btrfs_compress_op * const btrfs_compress_op[] = { @@ -840,13 +912,44 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -void btrfs_init_workspace_manager(struct workspace_manager *wsm, - const struct btrfs_compress_op *ops) +static struct list_head *alloc_workspace(int type, unsigned int level) { - struct list_head *workspace; + switch (type) { + case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); + case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); + case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); + case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } +} - wsm->ops = ops; +static void free_workspace(int type, struct list_head *ws) +{ + switch (type) { + case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws); + case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws); + case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws); + case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws); + default: + /* + * This can't happen, the type is validated several times + * before we get here. 
+ */ + BUG(); + } +} + +static void btrfs_init_workspace_manager(int type) +{ + struct workspace_manager *wsm; + struct list_head *workspace; + wsm = btrfs_compress_op[type]->workspace_manager; INIT_LIST_HEAD(&wsm->idle_ws); spin_lock_init(&wsm->ws_lock); atomic_set(&wsm->total_ws, 0); @@ -856,7 +959,7 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm, * Preallocate one workspace for each compression type so we can * guarantee forward progress in the worst case */ - workspace = wsm->ops->alloc_workspace(0); + workspace = alloc_workspace(type, 0); if (IS_ERR(workspace)) { pr_warn( "BTRFS: cannot preallocate compression workspace, will try later\n"); @@ -867,14 +970,16 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm, } } -void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman) +static void btrfs_cleanup_workspace_manager(int type) { + struct workspace_manager *wsman; struct list_head *ws; + wsman = btrfs_compress_op[type]->workspace_manager; while (!list_empty(&wsman->idle_ws)) { ws = wsman->idle_ws.next; list_del(ws); - wsman->ops->free_workspace(ws); + free_workspace(type, ws); atomic_dec(&wsman->total_ws); } } @@ -885,9 +990,9 @@ void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, - unsigned int level) +struct list_head *btrfs_get_workspace(int type, unsigned int level) { + struct workspace_manager *wsm; struct list_head *workspace; int cpus = num_online_cpus(); unsigned nofs_flag; @@ -897,6 +1002,7 @@ struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, wait_queue_head_t *ws_wait; int *free_ws; + wsm = btrfs_compress_op[type]->workspace_manager; idle_ws = &wsm->idle_ws; ws_lock = &wsm->ws_lock; total_ws = &wsm->total_ws; @@ -932,7 +1038,7 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = wsm->ops->alloc_workspace(level); + workspace = alloc_workspace(type, level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -965,21 +1071,34 @@ again: static struct list_head *get_workspace(int type, int level) { - return btrfs_compress_op[type]->get_workspace(level); + switch (type) { + case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); + case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); + case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); + case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level); + default: + /* + * This can't happen, the type is validated several times + * before we get here. 
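Workspace managers are now looked up by compression type rather than reached through per-algorithm callbacks, and each manager still preallocates one workspace at init so a request can always make forward progress even when later allocations fail. A much-simplified userspace model of the pool discipline (fixed-size idle array instead of the kernel's list; locking, waiting, and the per-CPU limit elided):

#include <stdlib.h>

struct pool { void *idle[8]; int n; };

static struct pool pools[4];            /* indexed by compression type */

static void pool_init(int type, size_t ws_size)
{
        void *ws = malloc(ws_size);     /* kernel: alloc_workspace()   */
        if (ws)                         /* failure tolerated, retried  */
                pools[type].idle[pools[type].n++] = ws;
}

static void *pool_get(int type, size_t ws_size)
{
        struct pool *p = &pools[type];

        if (p->n)
                return p->idle[--p->n]; /* reuse an idle workspace     */
        return malloc(ws_size);         /* kernel may sleep and retry  */
}

static void pool_put(int type, void *ws)
{
        struct pool *p = &pools[type];

        if (p->n < 8)
                p->idle[p->n++] = ws;   /* park it for the next caller */
        else
                free(ws);               /* enough idle ones already    */
}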
+ */ + BUG(); + } } /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws) +void btrfs_put_workspace(int type, struct list_head *ws) { + struct workspace_manager *wsm; struct list_head *idle_ws; spinlock_t *ws_lock; atomic_t *total_ws; wait_queue_head_t *ws_wait; int *free_ws; + wsm = btrfs_compress_op[type]->workspace_manager; idle_ws = &wsm->idle_ws; ws_lock = &wsm->ws_lock; total_ws = &wsm->total_ws; @@ -995,7 +1114,7 @@ void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws) } spin_unlock(ws_lock); - wsm->ops->free_workspace(ws); + free_workspace(type, ws); atomic_dec(total_ws); wake: cond_wake_up(ws_wait); @@ -1003,7 +1122,18 @@ wake: static void put_workspace(int type, struct list_head *ws) { - return btrfs_compress_op[type]->put_workspace(ws); + switch (type) { + case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); + case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws); + default: + /* + * This can't happen, the type is validated several times + * before we get here. + */ + BUG(); + } } /* @@ -1042,10 +1172,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); - ret = btrfs_compress_op[type]->compress_pages(workspace, mapping, - start, pages, - out_pages, - total_in, total_out); + ret = compression_compress_pages(type, workspace, mapping, start, pages, + out_pages, total_in, total_out); put_workspace(type, workspace); return ret; } @@ -1071,7 +1199,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) int type = cb->compress_type; workspace = get_workspace(type, 0); - ret = btrfs_compress_op[type]->decompress_bio(workspace, cb); + ret = compression_decompress_bio(type, workspace, cb); put_workspace(type, workspace); return ret; @@ -1089,9 +1217,8 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, int ret; workspace = get_workspace(type, 0); - ret = btrfs_compress_op[type]->decompress(workspace, data_in, - dest_page, start_byte, - srclen, destlen); + ret = compression_decompress(type, workspace, data_in, dest_page, + start_byte, srclen, destlen); put_workspace(type, workspace); return ret; @@ -1099,18 +1226,18 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, void __init btrfs_init_compress(void) { - int i; - - for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) - btrfs_compress_op[i]->init_workspace_manager(); + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_init_workspace_manager(); } void __cold btrfs_exit_compress(void) { - int i; - - for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) - btrfs_compress_op[i]->cleanup_workspace_manager(); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_cleanup_workspace_manager(); } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 4cb8be9ff88b..d253f7aa8ed5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -93,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, 
unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages, - unsigned int write_flags); + unsigned int write_flags, + struct cgroup_subsys_state *blkcg_css); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); @@ -104,11 +105,10 @@ enum btrfs_compression_type { BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2, BTRFS_COMPRESS_ZSTD = 3, - BTRFS_COMPRESS_TYPES = 3, + BTRFS_NR_COMPRESS_TYPES = 4, }; struct workspace_manager { - const struct btrfs_compress_op *ops; struct list_head idle_ws; spinlock_t ws_lock; /* Number of free workspaces */ @@ -119,50 +119,18 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -void btrfs_init_workspace_manager(struct workspace_manager *wsm, - const struct btrfs_compress_op *ops); -struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, - unsigned int level); -void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws); -void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm); +struct list_head *btrfs_get_workspace(int type, unsigned int level); +void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { - void (*init_workspace_manager)(void); - - void (*cleanup_workspace_manager)(void); - - struct list_head *(*get_workspace)(unsigned int level); - - void (*put_workspace)(struct list_head *ws); - - struct list_head *(*alloc_workspace)(unsigned int level); - - void (*free_workspace)(struct list_head *workspace); - - int (*compress_pages)(struct list_head *workspace, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out); - - int (*decompress_bio)(struct list_head *workspace, - struct compressed_bio *cb); - - int (*decompress)(struct list_head *workspace, - unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); - + struct workspace_manager *workspace_manager; /* Maximum level supported by the compression algorithm */ unsigned int max_level; unsigned int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ -#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1) +#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES extern const struct btrfs_compress_op btrfs_heuristic_compress; extern const struct btrfs_compress_op btrfs_zlib_compress; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e59cde204b2f..5b6e86aaf2e1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -32,8 +32,13 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, static const struct btrfs_csums { u16 size; const char *name; + const char *driver; } btrfs_csums[] = { [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, + [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, + [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", + .driver = "blake2b-256" }, }; int btrfs_super_csum_size(const struct btrfs_super_block *s) @@ -51,34 +56,25 @@ const char *btrfs_super_csum_name(u16 csum_type) return btrfs_csums[csum_type].name; } -struct btrfs_path *btrfs_alloc_path(void) +/* + * Return driver name if defined, otherwise the name that's also a valid driver + * name + */ +const char *btrfs_super_csum_driver(u16 csum_type) { - return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + /* csum type is validated at mount time */ + 
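Each row of the extended btrfs_csums[] table carries the digest size, the canonical checksum name, and, only where the crypto driver's name differs from it (blake2b needs the blake2b-256 driver), an explicit driver string; btrfs_super_csum_driver() then falls back from .driver to .name. The lookup modelled standalone (sizes in bytes, mirroring the table above):

#include <stddef.h>

struct csum_desc {
        unsigned short size;
        const char *name;
        const char *driver;     /* NULL when .name is also the driver */
};

/* Mirrors the table added above. */
static const struct csum_desc csums[] = {
        { 4,  "crc32c",   NULL },
        { 8,  "xxhash64", NULL },
        { 32, "sha256",   NULL },
        { 32, "blake2b",  "blake2b-256" },
};

static const char *csum_driver(int type)
{
        /* same fallback the kernel uses: driver ?: name */
        return csums[type].driver ? csums[type].driver : csums[type].name;
}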
return btrfs_csums[csum_type].driver ?: + btrfs_csums[csum_type].name; } -/* - * set all locked nodes in the path to blocking locks. This should - * be done before scheduling - */ -noinline void btrfs_set_path_blocking(struct btrfs_path *p) +size_t __const btrfs_get_num_csums(void) { - int i; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (!p->nodes[i] || !p->locks[i]) - continue; - /* - * If we currently have a spinning reader or writer lock this - * will bump the count of blocking holders and drop the - * spinlock. - */ - if (p->locks[i] == BTRFS_READ_LOCK) { - btrfs_set_lock_blocking_read(p->nodes[i]); - p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - } else if (p->locks[i] == BTRFS_WRITE_LOCK) { - btrfs_set_lock_blocking_write(p->nodes[i]); - p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; - } - } + return ARRAY_SIZE(btrfs_csums); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } /* this also releases the path */ @@ -1125,7 +1121,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; - extent_buffer_get(cow); + atomic_inc(&cow->refs); ret = tree_mod_log_insert_root(root->node, cow, 1); BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); @@ -1563,7 +1559,7 @@ static int comp_keys(const struct btrfs_disk_key *disk, /* * same as comp_keys only with two btrfs_key's */ -int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) +int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -2036,7 +2032,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - extent_buffer_get(left); + atomic_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -2379,32 +2375,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * This releases any locks held in the path starting at level and - * going all the way up to the root. - * - * btrfs_search_slot will keep the lock held on higher nodes in a few - * corner cases, such as COW of the block at slot zero in the node. This - * ignores those rules, and it should only be called when there are no - * more updates to be done higher up in the tree. - */ -noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) -{ - int i; - - if (path->keep_locks) - return; - - for (i = level; i < BTRFS_MAX_LEVEL; i++) { - if (!path->nodes[i]) - continue; - if (!path->locks[i]) - continue; - btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); - path->locks[i] = 0; - } -} - -/* * helper function for btrfs_search_slot. The goal is to find a block * in cache without setting the path to blocking. If we find the block * we return zero and the path is unchanged. 
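[Editor's sketch, not part of the patch: one of the hunks above adds __pure to btrfs_comp_cpu_keys(). The comparator orders keys by objectid, then type, then offset, returning <0, 0 or >0, and the annotation promises the result depends only on the two keys, so the compiler may fold repeated calls. A minimal caller that benefits; sketch_key_in_range() is a hypothetical name:]

/*
 * Sketch: btrfs_comp_cpu_keys() compares in (objectid, type, offset)
 * order.  Being __pure, both calls below can be CSE'd if the caller
 * repeats them.  This helper is illustrative only.
 */
static bool sketch_key_in_range(const struct btrfs_key *key,
				const struct btrfs_key *first,
				const struct btrfs_key *last)
{
	return btrfs_comp_cpu_keys(first, key) <= 0 &&
	       btrfs_comp_cpu_keys(key, last) <= 0;
}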
@@ -2652,7 +2622,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, } else { b = root->commit_root; - extent_buffer_get(b); + atomic_inc(&b->refs); } level = btrfs_header_level(b); /* @@ -2785,12 +2755,10 @@ again: } while (b) { + int dec = 0; + level = btrfs_header_level(b); - /* - * setup the path here so we can release it under lock - * contention with the cow code - */ if (cow) { bool last_level = (level == (BTRFS_MAX_LEVEL - 1)); @@ -2861,73 +2829,7 @@ cow_done: if (ret < 0) goto done; - if (level != 0) { - int dec = 0; - if (ret && slot > 0) { - dec = 1; - slot -= 1; - } - p->slots[level] = slot; - err = setup_nodes_for_search(trans, root, p, b, level, - ins_len, &write_lock_level); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - - /* - * slot 0 is special, if we change the key - * we have to update the parent pointer - * which means we must have a write lock - * on the parent - */ - if (slot == 0 && ins_len && - write_lock_level < level + 1) { - write_lock_level = level + 1; - btrfs_release_path(p); - goto again; - } - - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); - - if (level == lowest_level) { - if (dec) - p->slots[level]++; - goto done; - } - - err = read_block_for_search(root, p, &b, level, - slot, key); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } - - if (!p->skip_locking) { - level = btrfs_header_level(b); - if (level <= write_lock_level) { - if (!btrfs_try_tree_write_lock(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - } - p->locks[level] = BTRFS_WRITE_LOCK; - } else { - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - } - p->locks[level] = BTRFS_READ_LOCK; - } - p->nodes[level] = b; - } - } else { + if (level == 0) { p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(b) < ins_len) { @@ -2952,6 +2854,67 @@ cow_done: min_write_lock_level, NULL); goto done; } + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + err = setup_nodes_for_search(trans, root, p, b, level, ins_len, + &write_lock_level); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + + /* + * Slot 0 is special, if we change the key we have to update + * the parent pointer which means we must have a write lock on + * the parent + */ + if (slot == 0 && ins_len && write_lock_level < level + 1) { + write_lock_level = level + 1; + btrfs_release_path(p); + goto again; + } + + unlock_up(p, level, lowest_unlock, min_write_lock_level, + &write_lock_level); + + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } + + err = read_block_for_search(root, p, &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + if (!p->skip_locking) { + level = btrfs_header_level(b); + if (level <= write_lock_level) { + if (!btrfs_try_tree_write_lock(b)) { + btrfs_set_path_blocking(p); + btrfs_tree_lock(b); + } + p->locks[level] = BTRFS_WRITE_LOCK; + } else { + if (!btrfs_tree_read_lock_atomic(b)) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + } + p->locks[level] = BTRFS_READ_LOCK; + } + p->nodes[level] = b; + } } ret = 1; done: @@ -3008,6 +2971,8 @@ again: p->locks[level] = BTRFS_READ_LOCK; while (b) { + int dec = 0; + level = btrfs_header_level(b); p->nodes[level] = b; @@ -3028,47 +2993,45 @@ 
again: if (ret < 0) goto done; - if (level != 0) { - int dec = 0; - if (ret && slot > 0) { - dec = 1; - slot -= 1; - } + if (level == 0) { p->slots[level] = slot; unlock_up(p, level, lowest_unlock, 0, NULL); + goto done; + } - if (level == lowest_level) { - if (dec) - p->slots[level]++; - goto done; - } + if (ret && slot > 0) { + dec = 1; + slot--; + } + p->slots[level] = slot; + unlock_up(p, level, lowest_unlock, 0, NULL); - err = read_block_for_search(root, p, &b, level, - slot, key); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } + if (level == lowest_level) { + if (dec) + p->slots[level]++; + goto done; + } - level = btrfs_header_level(b); - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - } - b = tree_mod_log_rewind(fs_info, p, b, time_seq); - if (!b) { - ret = -ENOMEM; - goto done; - } - p->locks[level] = BTRFS_READ_LOCK; - p->nodes[level] = b; - } else { - p->slots[level] = slot; - unlock_up(p, level, lowest_unlock, 0, NULL); + err = read_block_for_search(root, p, &b, level, slot, key); + if (err == -EAGAIN) + goto again; + if (err) { + ret = err; + goto done; + } + + level = btrfs_header_level(b); + if (!btrfs_tree_read_lock_atomic(b)) { + btrfs_set_path_blocking(p); + btrfs_tree_read_lock(b); + } + b = tree_mod_log_rewind(fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; goto done; } + p->locks[level] = BTRFS_READ_LOCK; + p->nodes[level] = b; } ret = 1; done: @@ -3433,7 +3396,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, free_extent_buffer(old); add_root_to_dirty_list(root); - extent_buffer_get(c); + atomic_inc(&c->refs); path->nodes[level] = c; path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; path->slots[level] = 0; @@ -4966,7 +4929,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); - extent_buffer_get(leaf); + atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, root, leaf, 0, 1); free_extent_buffer_stale(leaf); } @@ -5047,7 +5010,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * for possible call to del_ptr below */ slot = path->slots[1]; - extent_buffer_get(leaf); + atomic_inc(&leaf->refs); btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 19d669d12ca1..b2e8fd8a8e59 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include <linux/dynamic_debug.h> #include <linux/refcount.h> #include <linux/crc32c.h> +#include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -38,7 +39,7 @@ struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; -struct btrfs_block_group_cache; +struct btrfs_block_group; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -56,9 +57,9 @@ struct btrfs_ref; * filesystem data as well that can be used to read data in order to repair * read errors on other disks. * - * Current value is derived from RAID1 with 2 copies. + * Current value is derived from RAID1C4 with 4 copies. 
*/ -#define BTRFS_MAX_MIRRORS (2 + 1) +#define BTRFS_MAX_MIRRORS (4 + 1) #define BTRFS_MAX_LEVEL 8 @@ -291,7 +292,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID) + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -413,7 +415,7 @@ struct btrfs_free_cluster { /* We did a full search and couldn't create a cluster */ bool fragmented; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; /* * when a cluster is allocated from a block group, we put the * cluster onto a list in the block group so that it can @@ -476,8 +478,8 @@ struct btrfs_swapfile_pin { void *ptr; struct inode *inode; /* - * If true, ptr points to a struct btrfs_block_group_cache. Otherwise, - * ptr points to a struct btrfs_device. + * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr + * points to a struct btrfs_device. */ bool is_block_group; }; @@ -722,7 +724,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; - struct btrfs_workqueue *submit_workers; struct btrfs_workqueue *caching_workers; struct btrfs_workqueue *readahead_workers; @@ -734,8 +735,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; - /* the extent workers do delayed refs on the extent allocation tree */ - struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; u32 thread_pool_size; @@ -1521,18 +1520,18 @@ static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, } /* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); -BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); -BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(disk_block_group_flags, +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(block_group_flags, +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); /* struct btrfs_free_space_info */ @@ -2165,6 +2164,9 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __const btrfs_get_num_csums(void); + /* * The leaf data grows from end-to-front in the node. 
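[Editor's sketch, not part of the patch: the block group accessor churn above is a pure rename — getters that read an item through an extent buffer keep the short btrfs_block_group_* names, while the variants operating on an in-memory struct gain the stack_ infix, matching the rest of the file. A sketch of the resulting call sites; sketch_bg_used() is hypothetical, the two accessors are the ones generated by the macros above:]

/*
 * Sketch: the two accessor flavours after the rename.  The plain name
 * reads the item out of a leaf's pages; the stack_ variant reads a
 * plain struct copy directly.
 */
static u64 sketch_bg_used(struct extent_buffer *leaf,
			  struct btrfs_block_group_item *item_in_leaf,
			  const struct btrfs_block_group_item *stack_copy)
{
	/* generated by BTRFS_SETGET_FUNCS(block_group_used, ...) */
	u64 used = btrfs_block_group_used(leaf, item_in_leaf);

	/* generated by BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, ...) */
	ASSERT(used == btrfs_stack_block_group_used(stack_copy));

	return used;
}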
@@ -2399,7 +2401,7 @@ static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache); +void btrfs_free_excluded_extents(struct btrfs_block_group *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, @@ -2455,8 +2457,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); -void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); +void btrfs_get_block_group_trimming(struct btrfs_block_group *cache); +void btrfs_put_block_group_trimming(struct btrfs_block_group *cache); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2489,8 +2491,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); @@ -2510,7 +2511,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int level, int *slot); -int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); +int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); @@ -2570,8 +2571,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); -void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); @@ -2873,10 +2872,9 @@ int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new, - struct btrfs_path *path); + struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *was_new); + struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end, int create); @@ -2912,7 +2910,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int btrfs_is_empty_uuid(u8 *uuid); +int __pure btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct 
inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); @@ -3146,7 +3144,7 @@ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); -const char *btrfs_decode_error(int errno); +const char * __attribute_const__ btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index d949d7d2abed..4cdac4d834f5 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -307,7 +307,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; - bool delalloc_lock = true; /* * If we are a free space inode we need to not flush since we will be in @@ -320,7 +319,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) */ if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; - delalloc_lock = false; } else { if (current->journal_info) flush = BTRFS_RESERVE_FLUSH_LIMIT; @@ -329,9 +327,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) schedule_timeout(1); } - if (delalloc_lock) - mutex_lock(&inode->delalloc_mutex); - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); /* @@ -348,10 +343,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); if (ret) - goto out_fail; + return ret; ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); - if (ret) - goto out_qgroup; + if (ret) { + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); + return ret; + } /* * Now we need to update our outstanding extents and csum bytes _first_ @@ -375,16 +372,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) block_rsv->qgroup_rsv_reserved += qgroup_reserve; spin_unlock(&block_rsv->lock); - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); return 0; -out_qgroup: - btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); -out_fail: - btrfs_inode_rsv_release(inode, true); - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return ret; } /** @@ -418,7 +406,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, * btrfs_delalloc_release_extents - release our outstanding_extents * @inode: the inode to balance the reservation for. * @num_bytes: the number of bytes we originally reserved with - * @qgroup_free: do we need to free qgroup meta reservation or convert them. * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -426,8 +413,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, * temporarily tracked outstanding_extents. This _must_ be used in conjunction * with btrfs_delalloc_reserve_metadata. 
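 * (With the qgroup_free parameter removed, the release below always
 * frees the qgroup meta reservation: the body now calls
 * btrfs_inode_rsv_release(inode, true) unconditionally.)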
*/ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned num_extents; @@ -441,7 +427,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode, qgroup_free); + btrfs_inode_rsv_release(inode, true); } /** diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1f7f39b10bd0..d3e15e1d4a91 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "ctree.h" #include "qgroup.h" +#include "locking.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -1367,8 +1368,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, - btrfs_async_run_delayed_root, NULL, NULL); + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, + NULL); async_work->nr = nr; btrfs_queue_work(fs_info->delayed_workers, &async_work->work); @@ -1949,12 +1950,19 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) } inode_id = delayed_nodes[n - 1]->inode_id + 1; - - for (i = 0; i < n; i++) - refcount_inc(&delayed_nodes[i]->refs); + for (i = 0; i < n; i++) { + /* + * Don't increase refs in case the node is dead and + * about to be removed from the tree in the loop below + */ + if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) + delayed_nodes[i] = NULL; + } spin_unlock(&root->inode_lock); for (i = 0; i < n; i++) { + if (!delayed_nodes[i]) + continue; __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 48890826b5e6..f639dde2a679 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -986,7 +986,7 @@ static int btrfs_dev_replace_kthread(void *data) return 0; } -int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) { if (!dev_replace->is_valid) return 0; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 78c5d8f1adda..60b70dacc299 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -17,6 +17,6 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 044981cf6df9..e0edfdc9c82b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -205,7 +205,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -213,7 +212,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) { - em->bdev = 
fs_info->fs_devices->latest_bdev; read_unlock(&em_tree->lock); goto out; } @@ -228,7 +226,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, em->len = (u64)-1; em->block_len = (u64)-1; em->block_start = 0; - em->bdev = fs_info->fs_devices->latest_bdev; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); @@ -352,6 +349,9 @@ static bool btrfs_supported_super_csum(u16 csum_type) { switch (csum_type) { case BTRFS_CSUM_TYPE_CRC32: + case BTRFS_CSUM_TYPE_XXHASH: + case BTRFS_CSUM_TYPE_SHA256: + case BTRFS_CSUM_TYPE_BLAKE2: return true; default: return false; @@ -545,9 +545,11 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) ret = btrfs_check_leaf_full(eb); if (ret < 0) { + btrfs_print_tree(eb, 0); btrfs_err(fs_info, "block=%llu write time tree block corruption detected", eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); return ret; } write_extent_buffer(eb, result, 0, csum_size); @@ -608,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, /* the pending IO might have been the only thing that kept this buffer * in memory. Make sure we have a ref for all this other checks */ - extent_buffer_get(eb); + atomic_inc(&eb->refs); reads_done = atomic_dec_and_test(&eb->io_pages); if (!reads_done) @@ -706,43 +708,31 @@ static void end_workqueue_bio(struct bio *bio) struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; struct btrfs_workqueue *wq; - btrfs_work_func_t func; fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; - func = btrfs_endio_meta_write_helper; - } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) wq = fs_info->endio_raid56_workers; - func = btrfs_endio_raid56_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } } else { - if (unlikely(end_io_wq->metadata == - BTRFS_WQ_ENDIO_DIO_REPAIR)) { + if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR)) wq = fs_info->endio_repair_workers; - func = btrfs_endio_repair_helper; - } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) wq = fs_info->endio_raid56_workers; - func = btrfs_endio_raid56_helper; - } else if (end_io_wq->metadata) { + else if (end_io_wq->metadata) wq = fs_info->endio_meta_workers; - func = btrfs_endio_meta_helper; - } else { + else wq = fs_info->endio_workers; - func = btrfs_endio_helper; - } } - btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); + btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); btrfs_queue_work(wq, &end_io_wq->work); } @@ -803,8 +793,13 @@ static void run_one_async_done(struct btrfs_work *work) return; } - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, - async->mirror_num, 1); + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. 
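+	 * (Punting means the block layer hands the bio to a per-cgroup
+	 * helper thread for submission, so the I/O is throttled as if
+	 * the owning cgroup had issued it directly.)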
+ */ + async->bio->bi_opf |= REQ_CGROUP_PUNT; + ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); if (ret) { async->bio->bi_status = ret; bio_endio(async->bio); @@ -835,8 +830,8 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; - btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, - run_one_async_done, run_one_async_free); + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); async->bio_offset = bio_offset; @@ -904,12 +899,12 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else { /* * kthread helpers are used to submit writes so that @@ -1657,8 +1652,8 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); bio_endio(bio); + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); } static int cleaner_kthread(void *arg) @@ -1753,7 +1748,7 @@ static int transaction_kthread(void *arg) } now = ktime_get_seconds(); - if (cur->state < TRANS_STATE_BLOCKED && + if (cur->state < TRANS_STATE_COMMIT_START && !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { @@ -1792,18 +1787,18 @@ sleep: } /* - * this will find the highest generation in the array of - * root backups. The index of the highest array is returned, - * or -1 if we can't find anything. + * This will find the highest generation in the array of root backups. The + * index of the highest array is returned, or -EINVAL if we can't find + * anything. * * We check to make sure the array is valid by comparing the * generation of the latest root in the array with the generation * in the super block. If they don't match we pitch it. */ -static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +static int find_newest_super_backup(struct btrfs_fs_info *info) { + const u64 newest_gen = btrfs_super_generation(info->super_copy); u64 cur; - int newest_index = -1; struct btrfs_root_backup *root_backup; int i; @@ -1811,37 +1806,10 @@ static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) root_backup = info->super_copy->super_roots + i; cur = btrfs_backup_tree_root_gen(root_backup); if (cur == newest_gen) - newest_index = i; + return i; } - /* check to see if we actually wrapped around */ - if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { - root_backup = info->super_copy->super_roots; - cur = btrfs_backup_tree_root_gen(root_backup); - if (cur == newest_gen) - newest_index = 0; - } - return newest_index; -} - - -/* - * find the oldest backup so we know where to store new entries - * in the backup array. 
This will set the backup_root_index - * field in the fs_info struct - */ -static void find_oldest_super_backup(struct btrfs_fs_info *info, - u64 newest_gen) -{ - int newest_index = -1; - - newest_index = find_newest_super_backup(info, newest_gen); - /* if there was garbage in there, just move along */ - if (newest_index == -1) { - info->backup_root_index = 0; - } else { - info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; - } + return -EINVAL; } /* @@ -1851,22 +1819,8 @@ static void find_oldest_super_backup(struct btrfs_fs_info *info, */ static void backup_super_roots(struct btrfs_fs_info *info) { - int next_backup; + const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - int last_backup; - - next_backup = info->backup_root_index; - last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % - BTRFS_NUM_BACKUP_ROOTS; - - /* - * just overwrite the last backup if we're at the same generation - * this happens only at umount - */ - root_backup = info->super_for_commit->super_roots + last_backup; - if (btrfs_backup_tree_root_gen(root_backup) == - btrfs_header_generation(info->tree_root->node)) - next_backup = last_backup; root_backup = info->super_for_commit->super_roots + next_backup; @@ -1939,40 +1893,31 @@ static void backup_super_roots(struct btrfs_fs_info *info) } /* - * this copies info out of the root backup array and back into - * the in-memory super block. It is meant to help iterate through - * the array, so you send it the number of backups you've already - * tried and the last backup index you used. + * read_backup_root - Reads a backup root based on the passed priority. Prio 0 + * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots * - * this returns -1 when it has tried all the backups + * fs_info - filesystem whose backup roots need to be read + * priority - priority of backup root required + * + * Returns backup root index on success and -EINVAL otherwise. 
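+ * Worked example: with BTRFS_NUM_BACKUP_ROOTS == 4 and the newest
+ * backup in slot 2, priority 1 resolves to (2 + 4 - 1) % 4 == 1, the
+ * slot written immediately before the newest one.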
*/ -static noinline int next_root_backup(struct btrfs_fs_info *info, - struct btrfs_super_block *super, - int *num_backups_tried, int *backup_index) +static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority) { + int backup_index = find_newest_super_backup(fs_info); + struct btrfs_super_block *super = fs_info->super_copy; struct btrfs_root_backup *root_backup; - int newest = *backup_index; - - if (*num_backups_tried == 0) { - u64 gen = btrfs_super_generation(super); - newest = find_newest_super_backup(info, gen); - if (newest == -1) - return -1; + if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) { + if (priority == 0) + return backup_index; - *backup_index = newest; - *num_backups_tried = 1; - } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { - /* we've tried all the backups, all done */ - return -1; + backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority; + backup_index %= BTRFS_NUM_BACKUP_ROOTS; } else { - /* jump to the next oldest backup */ - newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % - BTRFS_NUM_BACKUP_ROOTS; - *backup_index = newest; - *num_backups_tried += 1; + return -EINVAL; } - root_backup = super->super_roots + newest; + + root_backup = super->super_roots + backup_index; btrfs_set_super_generation(super, btrfs_backup_tree_root_gen(root_backup)); @@ -1982,12 +1927,13 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); /* - * fixme: the total bytes and num_devices need to match or we should + * Fixme: the total bytes and num_devices need to match or we should * need a fsck */ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); - return 0; + + return backup_index; } /* helper to cleanup workers */ @@ -2002,13 +1948,11 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); - btrfs_destroy_workqueue(fs_info->submit_workers); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); - btrfs_destroy_workqueue(fs_info->extent_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work @@ -2029,7 +1973,7 @@ static void free_root_extent_buffers(struct btrfs_root *root) } /* helper to cleanup tree roots */ -static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); @@ -2038,7 +1982,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); - if (chunk_root) + if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } @@ -2168,16 +2112,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->caching_workers = btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); - /* - * a higher idle thresh on the submit workers 
makes it much more - * likely that bios will be send down in a sane order to the - * devices - */ - fs_info->submit_workers = - btrfs_alloc_workqueue(fs_info, "submit", flags, - min_t(u64, fs_devices->num_devices, - max_active), 64); - fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); @@ -2214,13 +2148,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue(fs_info, "extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers && fs_info->flush_workers && + fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_repair_workers && @@ -2228,7 +2158,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { return -ENOMEM; } @@ -2239,13 +2168,13 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) { struct crypto_shash *csum_shash; - const char *csum_name = btrfs_super_csum_name(csum_type); + const char *csum_driver = btrfs_super_csum_driver(csum_type); - csum_shash = crypto_alloc_shash(csum_name, 0, 0); + csum_shash = crypto_alloc_shash(csum_driver, 0, 0); if (IS_ERR(csum_shash)) { btrfs_err(fs_info, "error allocating %s hash for checksum", - csum_name); + csum_driver); return PTR_ERR(csum_shash); } @@ -2595,7 +2524,101 @@ out: return ret; } -int open_ctree(struct super_block *sb, +static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) +{ + int backup_index = find_newest_super_backup(fs_info); + struct btrfs_super_block *sb = fs_info->super_copy; + struct btrfs_root *tree_root = fs_info->tree_root; + bool handle_error = false; + int ret = 0; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + u64 generation; + int level; + + if (handle_error) { + if (!IS_ERR(tree_root->node)) + free_extent_buffer(tree_root->node); + tree_root->node = NULL; + + if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) + break; + + free_root_pointers(fs_info, 0); + + /* + * Don't use the log in recovery mode, it won't be + * valid + */ + btrfs_set_super_log_root(sb, 0); + + /* We can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = read_backup_root(fs_info, i); + backup_index = ret; + if (ret < 0) + return ret; + } + generation = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), + generation, level, NULL); + if (IS_ERR(tree_root->node) || + !extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + + if (IS_ERR(tree_root->node)) + ret = PTR_ERR(tree_root->node); + else if (!extent_buffer_uptodate(tree_root->node)) + ret = -EUCLEAN; + + btrfs_warn(fs_info, "failed to read tree root"); + continue; + } + + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); + btrfs_set_root_refs(&tree_root->root_item, 1); + + /* + * No need to hold btrfs_root::objectid_mutex since the fs + * hasn't been fully 
initialised and we are the only user + */ + ret = btrfs_find_highest_objectid(tree_root, + &tree_root->highest_objectid); + if (ret < 0) { + handle_error = true; + continue; + } + + ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + ret = btrfs_read_roots(fs_info); + if (ret < 0) { + handle_error = true; + continue; + } + + /* All successful */ + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + + /* Always begin writing backup roots after the one being used */ + if (backup_index < 0) { + fs_info->backup_root_index = 0; + } else { + fs_info->backup_root_index = backup_index + 1; + fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS; + } + break; + } + + return ret; +} + +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -2613,8 +2636,6 @@ int open_ctree(struct super_block *sb, struct btrfs_root *chunk_root; int ret; int err = -EINVAL; - int num_backups_tried = 0; - int backup_index = 0; int clear_free_space_tree = 0; int level; @@ -2885,13 +2906,6 @@ int open_ctree(struct super_block *sb, set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* - * run through our array of backup supers and setup - * our ring pointer to the oldest one - */ - generation = btrfs_super_generation(disk_super); - find_oldest_super_backup(fs_info, generation); - - /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. */ @@ -3037,44 +3051,9 @@ int open_ctree(struct super_block *sb, goto fail_tree_roots; } -retry_root_backup: - generation = btrfs_super_generation(disk_super); - level = btrfs_super_root_level(disk_super); - - tree_root->node = read_tree_block(fs_info, - btrfs_super_root(disk_super), - generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { - btrfs_warn(fs_info, "failed to read tree root"); - if (!IS_ERR(tree_root->node)) - free_extent_buffer(tree_root->node); - tree_root->node = NULL; - goto recovery_tree_root; - } - - btrfs_set_root_node(&tree_root->root_item, tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - btrfs_set_root_refs(&tree_root->root_item, 1); - - mutex_lock(&tree_root->objectid_mutex); - ret = btrfs_find_highest_objectid(tree_root, - &tree_root->highest_objectid); - if (ret) { - mutex_unlock(&tree_root->objectid_mutex); - goto recovery_tree_root; - } - - ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); - - mutex_unlock(&tree_root->objectid_mutex); - - ret = btrfs_read_roots(fs_info); + ret = init_tree_roots(fs_info); if (ret) - goto recovery_tree_root; - - fs_info->generation = generation; - fs_info->last_trans_committed = generation; + goto fail_tree_roots; ret = btrfs_verify_dev_extents(fs_info); if (ret) { @@ -3342,7 +3321,7 @@ fail_block_groups: btrfs_put_block_group_cache(fs_info); fail_tree_roots: - free_root_pointers(fs_info, 1); + free_root_pointers(fs_info, true); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: @@ -3369,24 +3348,6 @@ fail: btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; - -recovery_tree_root: - if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) - goto fail_tree_roots; - - free_root_pointers(fs_info, 0); - - /* don't use the log in recovery mode, it won't be valid */ - btrfs_set_super_log_root(disk_super, 0); - - /* we can't trust the free space cache either */ - btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); - - ret = 
next_root_backup(fs_info, fs_info->super_copy, - &num_backups_tried, &backup_index); - if (ret == -1) - goto fail_block_groups; - goto retry_root_backup; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); @@ -3980,7 +3941,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info) return btrfs_commit_transaction(trans); } -void close_ctree(struct btrfs_fs_info *fs_info) +void __cold close_ctree(struct btrfs_fs_info *fs_info) { int ret; @@ -4068,7 +4029,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) btrfs_free_block_groups(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); - free_root_pointers(fs_info, 1); + free_root_pointers(fs_info, true); iput(fs_info->btree_inode); @@ -4445,7 +4406,7 @@ again: return 0; } -static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) +static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) { struct inode *inode; @@ -4462,12 +4423,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, dirty_list); if (!list_empty(&cache->io_list)) { @@ -4495,7 +4456,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, io_list); list_del_init(&cache->io_list); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a6958103d87e..76f123ebb292 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -49,10 +49,10 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_clean_tree_block(struct extent_buffer *buf); -int open_ctree(struct super_block *sb, +int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); -void close_ctree(struct btrfs_fs_info *fs_info); +void __cold close_ctree(struct btrfs_fs_info *fs_info); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ddf28ecf17f9..72e312cae69d 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -87,7 +87,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, root, NULL); + inode = btrfs_iget(sb, &key, root); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail; @@ -214,7 +214,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root, NULL)); + return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root)); fail: btrfs_free_path(path); return ERR_PTR(ret); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h new file mode 100644 index 000000000000..a3febe746c79 --- /dev/null +++ b/fs/btrfs/extent-io-tree.h @@ -0,0 +1,248 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_IO_TREE_H +#define BTRFS_EXTENT_IO_TREE_H + +struct extent_changeset; +struct io_failure_record; + +/* Bits for the extent state 
*/ +#define EXTENT_DIRTY (1U << 0) +#define EXTENT_UPTODATE (1U << 1) +#define EXTENT_LOCKED (1U << 2) +#define EXTENT_NEW (1U << 3) +#define EXTENT_DELALLOC (1U << 4) +#define EXTENT_DEFRAG (1U << 5) +#define EXTENT_BOUNDARY (1U << 6) +#define EXTENT_NODATASUM (1U << 7) +#define EXTENT_CLEAR_META_RESV (1U << 8) +#define EXTENT_NEED_WAIT (1U << 9) +#define EXTENT_DAMAGED (1U << 10) +#define EXTENT_NORESERVE (1U << 11) +#define EXTENT_QGROUP_RESERVED (1U << 12) +#define EXTENT_CLEAR_DATA_RESV (1U << 13) +#define EXTENT_DELALLOC_NEW (1U << 14) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) + +/* + * Redefined bits above which are used only in the device allocation tree, + * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV + * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit + * manipulation functions + */ +#define CHUNK_ALLOCATED EXTENT_DIRTY +#define CHUNK_TRIMMED EXTENT_DEFRAG + +enum { + IO_TREE_FS_INFO_FREED_EXTENTS0, + IO_TREE_FS_INFO_FREED_EXTENTS1, + IO_TREE_INODE_IO, + IO_TREE_INODE_IO_FAILURE, + IO_TREE_RELOC_BLOCKS, + IO_TREE_TRANS_DIRTY_PAGES, + IO_TREE_ROOT_DIRTY_LOG_PAGES, + IO_TREE_SELFTEST, +}; + +struct extent_io_tree { + struct rb_root state; + struct btrfs_fs_info *fs_info; + void *private_data; + u64 dirty_bytes; + bool track_uptodate; + + /* Who owns this io tree, should be one of IO_TREE_* */ + u8 owner; + + spinlock_t lock; + const struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ + wait_queue_head_t wq; + refcount_t refs; + unsigned state; + + struct io_failure_record *failrec; + +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif +}; + +int __init extent_state_cache_init(void); +void __cold extent_state_cache_exit(void); + +void extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner, + void *private_data); +void extent_io_tree_release(struct extent_io_tree *tree); + +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); + +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return lock_extent_bits(tree, start, end, NULL); +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); + +int __init extent_io_init(void); +void __cold extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned bits, int contig); + +void free_extent_state(struct extent_state *state); +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int filled, + struct extent_state *cached_state); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, struct extent_changeset *changeset); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached); +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); + +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); +} + +static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, + u64 end, 
struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_NOFS, NULL); +} + +static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, + u64 start, u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + GFP_ATOMIC, NULL); +} + +static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits) +{ + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); +} + +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, struct extent_changeset *changeset); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); +int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits); + +static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits) +{ + return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); +} + +static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state) +{ + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, GFP_NOFS, NULL); +} + +static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, + NULL, mask); +} + +static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, cached); +} + +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, unsigned clear_bits, + struct extent_state **cached_state); + +static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, + u64 end, unsigned int extra_bits, + struct extent_state **cached_state) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, + NULL, cached_state, GFP_NOFS); +} + +static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, GFP_NOFS); +} + +static inline int set_extent_new(struct extent_io_tree *tree, u64 start, + u64 end) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, + GFP_NOFS); +} + +static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); +} + +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits, + struct extent_state **cached_state); +void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, unsigned bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state); + +/* This should be reworked in the future and put elsewhere. 
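+ * The helpers below track io_failure_record objects through the
+ * inode's failure tree, keyed by the logical start offset of the
+ * failed range.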
*/ +int get_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record **failrec); +int set_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record *failrec); +void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, + u64 end); +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret); +int free_io_failure(struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, + struct io_failure_record *rec); +int clean_io_failure(struct btrfs_fs_info *fs_info, + struct extent_io_tree *failure_tree, + struct extent_io_tree *io_tree, u64 start, + struct page *page, u64 ino, unsigned int pg_offset); + +#endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 49cb26fa7c63..153f71a5bba9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -54,7 +54,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); -static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +static int block_group_bits(struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } @@ -70,13 +70,13 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, return 0; } -void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache) +void btrfs_free_excluded_extents(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; u64 start, end; - start = cache->key.objectid; - end = start + cache->key.offset - 1; + start = cache->start; + end = start + cache->length - 1; clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_UPTODATE); @@ -1306,8 +1306,10 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { - int ret; + int ret = 0; u64 discarded_bytes = 0; + u64 end = bytenr + num_bytes; + u64 cur = bytenr; struct btrfs_bio *bbio = NULL; @@ -1316,15 +1318,23 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, * associated to its stripes that don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, - &bbio, 0); - /* Error condition is -ENOMEM */ - if (!ret) { - struct btrfs_bio_stripe *stripe = bbio->stripes; + while (cur < end) { + struct btrfs_bio_stripe *stripe; int i; + num_bytes = end - cur; + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, + &num_bytes, &bbio, 0); + /* + * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or + * -EOPNOTSUPP. For any such error, @num_bytes is not updated, + * thus we can't continue anyway. 
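+		 * (btrfs_map_block() also trims @num_bytes to the end of
+		 * the current chunk mapping; the surrounding loop relies
+		 * on that when it advances @cur by the mapped length.)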
+ */ + if (ret < 0) + goto out; + stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; struct request_queue *req_q; @@ -1341,10 +1351,19 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe->physical, stripe->length, &bytes); - if (!ret) + if (!ret) { discarded_bytes += bytes; - else if (ret != -EOPNOTSUPP) - break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ + } else if (ret != -EOPNOTSUPP) { + /* + * Logic errors or -ENOMEM, or -EIO, but + * unlikely to happen. + * + * And since there are two loops, explicitly + * go to out to avoid confusion. + */ + btrfs_put_bbio(bbio); + goto out; + } /* * Just in case we get back EOPNOTSUPP for some reason, @@ -1354,7 +1373,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, ret = 0; } btrfs_put_bbio(bbio); + cur += num_bytes; } +out: btrfs_bio_counter_dec(fs_info); if (actual_bytes) @@ -2516,7 +2537,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; int readonly = 0; block_group = btrfs_lookup_block_group(fs_info, bytenr); @@ -2546,7 +2567,7 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 bytenr; spin_lock(&fs_info->block_group_cache_lock); @@ -2560,13 +2581,13 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) if (!cache) return 0; - bytenr = cache->key.objectid; + bytenr = cache->start; btrfs_put_block_group(cache); return bytenr; } -static int pin_down_extent(struct btrfs_block_group_cache *cache, +static int pin_down_extent(struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -2590,13 +2611,12 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache, return 0; } -/* - * this function must be called within transaction - */ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; + + ASSERT(fs_info->running_transaction); cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); /* Logic error */ @@ -2613,7 +2633,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info, int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; int ret; cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2640,7 +2660,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { int ret; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_caching_control *caching_ctl; block_group = btrfs_lookup_block_group(fs_info, start); @@ -2652,7 +2672,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, if (!caching_ctl) { /* Logic error */ - BUG_ON(!btrfs_block_group_cache_done(block_group)); + BUG_ON(!btrfs_block_group_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); } else { mutex_lock(&caching_ctl->mutex); @@ -2717,7 +2737,7 @@ int btrfs_exclude_logged_extents(struct extent_buffer *eb) } static void 
-btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) { atomic_inc(&bg->reservations); } @@ -2726,14 +2746,14 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) { struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { cache = caching_ctl->block_group; - if (btrfs_block_group_cache_done(cache)) { + if (btrfs_block_group_done(cache)) { cache->last_byte_to_unpin = (u64)-1; list_del_init(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); @@ -2785,7 +2805,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end, const bool return_free_space) { - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_free_cluster *cluster = NULL; @@ -2797,7 +2817,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, while (start <= end) { readonly = false; if (!cache || - start >= cache->key.objectid + cache->key.offset) { + start >= cache->start + cache->length) { if (cache) btrfs_put_block_group(cache); total_unpinned = 0; @@ -2810,7 +2830,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, empty_cluster <<= 1; } - len = cache->key.objectid + cache->key.offset - start; + len = cache->start + cache->length - start; len = min(len, end + 1 - start); if (start < cache->last_byte_to_unpin) { @@ -2880,7 +2900,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; @@ -2926,8 +2946,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) ret = -EROFS; if (!trans->aborted) ret = btrfs_discard_extent(fs_info, - block_group->key.objectid, - block_group->key.offset, + block_group->start, + block_group->length, &trimmed); list_del_init(&block_group->bg_list); @@ -3262,7 +3282,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); @@ -3349,15 +3369,14 @@ enum btrfs_loop_type { }; static inline void -btrfs_lock_block_group(struct btrfs_block_group_cache *cache, +btrfs_lock_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) down_read(&cache->data_rwsem); } -static inline void -btrfs_grab_block_group(struct btrfs_block_group_cache *cache, +static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, int delalloc) { btrfs_get_block_group(cache); @@ -3365,12 +3384,12 @@ btrfs_grab_block_group(struct btrfs_block_group_cache *cache, down_read(&cache->data_rwsem); } -static struct btrfs_block_group_cache * -btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, +static struct btrfs_block_group *btrfs_lock_cluster( + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) 
{ - struct btrfs_block_group_cache *used_bg = NULL; + struct btrfs_block_group *used_bg = NULL; spin_lock(&cluster->refill_lock); while (1) { @@ -3404,7 +3423,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, } static inline void -btrfs_release_block_group(struct btrfs_block_group_cache *cache, +btrfs_release_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) @@ -3475,12 +3494,12 @@ struct find_free_extent_ctl { * Return >0 to inform caller that we found nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ -static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, struct btrfs_free_cluster *last_ptr, struct find_free_extent_ctl *ffe_ctl, - struct btrfs_block_group_cache **cluster_bg_ret) + struct btrfs_block_group **cluster_bg_ret) { - struct btrfs_block_group_cache *cluster_bg; + struct btrfs_block_group *cluster_bg; u64 aligned_cluster; u64 offset; int ret; @@ -3493,7 +3512,7 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, - ffe_ctl->num_bytes, cluster_bg->key.objectid, + ffe_ctl->num_bytes, cluster_bg->start, &ffe_ctl->max_extent_size); if (offset) { /* We have a block, we're done */ @@ -3579,7 +3598,7 @@ refill_cluster: * Return 0 when we found a free extent and set ffe_ctl->found_offset * Return -EAGAIN to inform caller that we need to re-search this block group */ -static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, +static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct btrfs_free_cluster *last_ptr, struct find_free_extent_ctl *ffe_ctl) { @@ -3781,7 +3800,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, { int ret = 0; struct btrfs_free_cluster *last_ptr = NULL; - struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool use_cluster = true; @@ -3904,7 +3923,7 @@ search: continue; btrfs_grab_block_group(block_group, delalloc); - ffe_ctl.search_start = block_group->key.objectid; + ffe_ctl.search_start = block_group->start; /* * this can happen if we end up cycling through all the @@ -3935,7 +3954,7 @@ search: } have_block_group: - ffe_ctl.cached = btrfs_block_group_cache_done(block_group); + ffe_ctl.cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl.cached)) { ffe_ctl.have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); @@ -3951,7 +3970,7 @@ have_block_group: * let's look there */ if (last_ptr && use_cluster) { - struct btrfs_block_group_cache *cluster_bg = NULL; + struct btrfs_block_group *cluster_bg = NULL; ret = find_free_extent_clustered(block_group, last_ptr, &ffe_ctl, &cluster_bg); @@ -3984,7 +4003,7 @@ checks: /* move on to the next group */ if (ffe_ctl.search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { + block_group->start + block_group->length) { btrfs_add_free_space(block_group, ffe_ctl.found_offset, num_bytes); goto loop; @@ -4133,7 +4152,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int pin, int delalloc) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; int ret = 0; cache = btrfs_lookup_block_group(fs_info, start); @@ -4366,7 +4385,7 @@ int 
btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; /* @@ -5436,7 +5455,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); - extent_buffer_get(parent); + atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); @@ -5480,7 +5499,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, */ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; u64 free_bytes = 0; int factor; @@ -5498,9 +5517,8 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) } factor = btrfs_bg_type_to_factor(block_group->flags); - free_bytes += (block_group->key.offset - - btrfs_block_group_used(&block_group->item)) * - factor; + free_bytes += (block_group->length - + block_group->used) * factor; spin_unlock(&block_group->lock); } @@ -5623,7 +5641,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; struct btrfs_device *device; struct list_head *devices; u64 group_trimmed; @@ -5647,16 +5665,16 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) cache = btrfs_lookup_first_block_group(fs_info, range->start); for (; cache; cache = btrfs_next_block_group(cache)) { - if (cache->key.objectid >= range_end) { + if (cache->start >= range_end) { btrfs_put_block_group(cache); break; } - start = max(range->start, cache->key.objectid); - end = min(range_end, cache->key.objectid + cache->key.offset); + start = max(range->start, cache->start); + end = min(range_end, cache->start + cache->length); if (end - start >= range->minlen) { - if (!btrfs_block_group_cache_done(cache)) { + if (!btrfs_block_group_done(cache)) { ret = btrfs_cache_block_group(cache, 0); if (ret) { bg_failed++; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cceaf05aada2..eb8bd0258360 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,6 +14,7 @@ #include <linux/prefetch.h> #include <linux/cleancache.h> #include "extent_io.h" +#include "extent-io-tree.h" #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" @@ -59,12 +60,23 @@ void btrfs_leak_debug_del(struct list_head *entry) spin_unlock_irqrestore(&leak_lock, flags); } -static inline -void btrfs_leak_debug_check(void) +static inline void btrfs_extent_buffer_leak_debug_check(void) { - struct extent_state *state; struct extent_buffer *eb; + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } +} + +static inline void btrfs_extent_state_leak_debug_check(void) +{ + struct extent_state *state; + while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", @@ -74,14 +86,6 @@ void btrfs_leak_debug_check(void) 
list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } - - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", - eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } } #define btrfs_debug_check_extent_io_range(tree, start, end) \ @@ -105,7 +109,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #else #define btrfs_leak_debug_add(new, head) do {} while (0) #define btrfs_leak_debug_del(entry) do {} while (0) -#define btrfs_leak_debug_check() do {} while (0) +#define btrfs_extent_buffer_leak_debug_check() do {} while (0) +#define btrfs_extent_state_leak_debug_check() do {} while (0) #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif @@ -196,19 +201,23 @@ static int __must_check flush_write_bio(struct extent_page_data *epd) return ret; } -int __init extent_io_init(void) +int __init extent_state_cache_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; + return 0; +} +int __init extent_io_init(void) +{ extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) - goto free_state_cache; + return -ENOMEM; if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_io_bio, bio), @@ -226,23 +235,24 @@ free_bioset: free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); extent_buffer_cache = NULL; + return -ENOMEM; +} -free_state_cache: +void __cold extent_state_cache_exit(void) +{ + btrfs_extent_state_leak_debug_check(); kmem_cache_destroy(extent_state_cache); - extent_state_cache = NULL; - return -ENOMEM; } void __cold extent_io_exit(void) { - btrfs_leak_debug_check(); + btrfs_extent_buffer_leak_debug_check(); /* * Make sure all delayed rcu free are flushed before we * destroy caches. */ rcu_barrier(); - kmem_cache_destroy(extent_state_cache); kmem_cache_destroy(extent_buffer_cache); bioset_exit(&btrfs_bioset); } @@ -1676,9 +1686,9 @@ out: * * true is returned if we find something, false if nothing was in the tree */ -static noinline bool find_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes, - struct extent_state **cached_state) +bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, + u64 *end, u64 max_bytes, + struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; @@ -1796,8 +1806,8 @@ again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; - found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, - max_bytes, &cached_state); + found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes, &cached_state); if (!found || delalloc_end <= *start) { *start = delalloc_start; *end = delalloc_end; @@ -1899,7 +1909,7 @@ static int __process_pages_contig(struct address_space *mapping, if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); - if (pages[i] == locked_page) { + if (locked_page && pages[i] == locked_page) { put_page(pages[i]); pages_locked++; continue; @@ -2014,8 +2024,8 @@ out: * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. 
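
Below, set_state_failrec() and get_state_failrec() drop their static noinline markers; paired with the declarations added to extent-io-tree.h at the top of this section, they become shared helpers. A hedged sketch of a call site, using only the signatures shown in this diff (the tree and start variables are assumed context):

    struct io_failure_record *rec = NULL;
    int err;

    /* Fetch the record stored at @start, if any... */
    err = get_state_failrec(tree, start, &rec);
    /* ...and store one back; a no-op if no extent_state exists there. */
    if (!err)
            err = set_state_failrec(tree, start, rec);
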
*/ -static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record *failrec) +int set_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record *failrec) { struct rb_node *node; struct extent_state *state; @@ -2042,8 +2052,8 @@ out: return ret; } -static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record **failrec) +int get_state_failrec(struct extent_io_tree *tree, u64 start, + struct io_failure_record **failrec) { struct rb_node *node; struct extent_state *state; @@ -2534,7 +2544,6 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; - bio_set_dev(bio, fs_info->fs_devices->latest_bdev); bio->bi_iter.bi_size = 0; bio->bi_private = data; @@ -2920,7 +2929,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * a contiguous page to the previous one * @size: portion of page that we want to write * @offset: starting offset in the page - * @bdev: attach newly created bios to this bdev * @bio_ret: must be valid pointer, newly allocated bio will be stored there * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write @@ -2931,7 +2939,6 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, u64 offset, size_t size, unsigned long pg_offset, - struct block_device *bdev, struct bio **bio_ret, bio_end_io_t end_io_func, int mirror_num, @@ -2977,13 +2984,16 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } bio = btrfs_bio_alloc(offset); - bio_set_dev(bio, bdev); bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; bio->bi_opf = opf; if (wbc) { + struct block_device *bdev; + + bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, page, page_size); } @@ -3065,7 +3075,6 @@ static int __do_readpage(struct extent_io_tree *tree, u64 block_start; u64 cur_end; struct extent_map *em; - struct block_device *bdev; int ret = 0; int nr = 0; size_t pg_offset = 0; @@ -3142,7 +3151,6 @@ static int __do_readpage(struct extent_io_tree *tree, offset = em->block_start + extent_offset; disk_io_size = iosize; } - bdev = em->bdev; block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3232,7 +3240,7 @@ static int __do_readpage(struct extent_io_tree *tree, ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, page, offset, disk_io_size, - pg_offset, bdev, bio, + pg_offset, bio, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag, @@ -3409,7 +3417,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, struct extent_page_data *epd, loff_t i_size, unsigned long nr_written, - unsigned int write_flags, int *nr_ret) + int *nr_ret) { struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); @@ -3420,11 +3428,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 block_start; u64 iosize; struct extent_map *em; - struct block_device *bdev; size_t pg_offset = 0; size_t blocksize; int ret = 0; int nr = 0; + const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; ret = 
btrfs_writepage_cow_fixup(page, start, page_end); @@ -3478,7 +3486,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); offset = em->block_start + extent_offset; - bdev = em->bdev; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); free_extent_map(em); @@ -3520,7 +3527,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, page, offset, iosize, pg_offset, - bdev, &epd->bio, + &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); if (ret) { @@ -3558,11 +3565,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset = 0; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - unsigned int write_flags = 0; unsigned long nr_written = 0; - write_flags = wbc_to_write_flags(wbc); - trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); @@ -3600,7 +3604,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } ret = __extent_writepage_io(inode, page, wbc, epd, - i_size, nr_written, write_flags, &nr); + i_size, nr_written, &nr); if (ret == 1) goto done_unlocked; @@ -3849,7 +3853,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct extent_page_data *epd) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct block_device *bdev = fs_info->fs_devices->latest_bdev; struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; u64 offset = eb->start; u32 nritems; @@ -3884,7 +3887,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - p, offset, PAGE_SIZE, 0, bdev, + p, offset, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); @@ -4121,7 +4124,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - done_index = page->index; + done_index = page->index + 1; /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -4156,16 +4159,6 @@ retry: ret = __extent_writepage(page, wbc, epd); if (ret < 0) { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). 
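
The deleted comment above explained why done_index used to be pushed past a failing page only in the error path; the hunk below moves that bookkeeping to the top of the loop so it happens for every page that is picked up. A simplified sketch of the resulting loop shape in extent_write_cache_pages():

    for (i = 0; i < nr_pages; i++) {
            struct page *page = pvec.pages[i];

            /*
             * Advance unconditionally: if we break out early,
             * writeback resumes after this page, so a media error
             * cannot stall background writeout for the whole file.
             */
            done_index = page->index + 1;

            ret = __extent_writepage(page, wbc, epd);
            if (ret < 0) {
                    done = 1;
                    break;
            }
    }
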
- */ - done_index = page->index + 1; done = 1; break; } @@ -4240,8 +4233,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, .nr_to_write = nr_pages * 2, .range_start = start, .range_end = end + 1, + /* We're called from an async helper function */ + .punt_to_cgroup = 1, + .no_cgroup_owner = 1, }; + wbc_attach_fdatawrite_inode(&wbc_writepages, inode); while (start <= end) { page = find_get_page(mapping, start >> PAGE_SHIFT); if (clear_page_dirty_for_io(page)) @@ -4256,11 +4253,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, } ASSERT(ret <= 0); - if (ret < 0) { + if (ret == 0) + ret = flush_write_bio(&epd); + else end_write_bio(&epd, ret); - return ret; - } - ret = flush_write_bio(&epd); + + wbc_detach_inode(&wbc_writepages); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index cf3424d58fec..a8551a1f56e2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,35 +7,6 @@ #include <linux/refcount.h> #include "ulist.h" -/* bits for the extent state */ -#define EXTENT_DIRTY (1U << 0) -#define EXTENT_UPTODATE (1U << 1) -#define EXTENT_LOCKED (1U << 2) -#define EXTENT_NEW (1U << 3) -#define EXTENT_DELALLOC (1U << 4) -#define EXTENT_DEFRAG (1U << 5) -#define EXTENT_BOUNDARY (1U << 6) -#define EXTENT_NODATASUM (1U << 7) -#define EXTENT_CLEAR_META_RESV (1U << 8) -#define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_DAMAGED (1U << 10) -#define EXTENT_NORESERVE (1U << 11) -#define EXTENT_QGROUP_RESERVED (1U << 12) -#define EXTENT_CLEAR_DATA_RESV (1U << 13) -#define EXTENT_DELALLOC_NEW (1U << 14) -#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ - EXTENT_CLEAR_DATA_RESV) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) - -/* - * Redefined bits above which are used only in the device allocation tree, - * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV - * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit - * manipulation functions - */ -#define CHUNK_ALLOCATED EXTENT_DIRTY -#define CHUNK_TRIMMED EXTENT_DEFRAG - /* * flags for bio submission. 
The high bits indicate the compression * type for this bio @@ -89,12 +60,11 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) -struct extent_state; struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; - +struct extent_io_tree; typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); @@ -111,47 +81,6 @@ struct extent_io_ops { int mirror); }; -enum { - IO_TREE_FS_INFO_FREED_EXTENTS0, - IO_TREE_FS_INFO_FREED_EXTENTS1, - IO_TREE_INODE_IO, - IO_TREE_INODE_IO_FAILURE, - IO_TREE_RELOC_BLOCKS, - IO_TREE_TRANS_DIRTY_PAGES, - IO_TREE_ROOT_DIRTY_LOG_PAGES, - IO_TREE_SELFTEST, -}; - -struct extent_io_tree { - struct rb_root state; - struct btrfs_fs_info *fs_info; - void *private_data; - u64 dirty_bytes; - bool track_uptodate; - - /* Who owns this io tree, should be one of IO_TREE_* */ - u8 owner; - - spinlock_t lock; - const struct extent_io_ops *ops; -}; - -struct extent_state { - u64 start; - u64 end; /* inclusive */ - struct rb_node rb_node; - - /* ADD NEW ELEMENTS AFTER THIS */ - wait_queue_head_t wq; - refcount_t refs; - unsigned state; - - struct io_failure_record *failrec; - -#ifdef CONFIG_BTRFS_DEBUG - struct list_head leak_list; -#endif -}; #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) @@ -259,152 +188,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, u64 start, u64 len, int create); -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data); -void extent_io_tree_release(struct extent_io_tree *tree); int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, NULL); -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); -int __init extent_io_init(void); -void __cold extent_io_exit(void); - -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, unsigned bits, int contig); - -void free_extent_state(struct extent_state *state); -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int filled, - struct extent_state *cached_state); -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached); -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask, - struct extent_changeset *changeset); - -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); -} - -static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_NOFS, NULL); -} - -static inline int 
unlock_extent_cached_atomic(struct extent_io_tree *tree, - u64 start, u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - GFP_ATOMIC, NULL); -} - -static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) -{ - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); -} - -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask); -int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits); - -static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); -} - -static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) -{ - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, GFP_NOFS, NULL); -} - -static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); -} - -static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, cached); -} - -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned clear_bits, - struct extent_state **cached_state); - -static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, unsigned int extra_bits, - struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - NULL, cached_state, GFP_NOFS); -} - -static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, GFP_NOFS); -} - -static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, - GFP_NOFS); -} - -static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, - cached_state, mask); -} - -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits, - struct extent_state **cached_state); -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -442,11 +230,6 @@ static inline int num_extent_pages(const struct extent_buffer *eb) (eb->start >> PAGE_SHIFT); } -static inline void extent_buffer_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->refs); -} - static inline int 
extent_buffer_uptodate(struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -508,10 +291,6 @@ struct btrfs_inode; int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); -int clean_io_failure(struct btrfs_fs_info *fs_info, - struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, u64 start, - struct page *page, u64 ino, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num); @@ -535,19 +314,12 @@ struct io_failure_record { }; -void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, - u64 end); -int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret); bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, struct io_failure_record *failrec, int fail_mirror); struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data); -int free_io_failure(struct extent_io_tree *failure_tree, - struct extent_io_tree *io_tree, - struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, @@ -555,5 +327,4 @@ bool find_lock_delalloc_range(struct inode *inode, #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); - #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9d30acca55e1..6f417ff68980 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -214,9 +214,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) ASSERT(next->block_start != EXTENT_MAP_DELALLOC && prev->block_start != EXTENT_MAP_DELALLOC); + if (prev->map_lookup || next->map_lookup) + ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) && + test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags)); + if (extent_map_end(prev) == next->start && prev->flags == next->flags && - prev->bdev == next->bdev && + prev->map_lookup == next->map_lookup && ((next->block_start == EXTENT_MAP_HOLE && prev->block_start == EXTENT_MAP_HOLE) || (next->block_start == EXTENT_MAP_INLINE && diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 473f039fcd7c..8e217337dff9 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -42,15 +42,8 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - union { - struct block_device *bdev; - - /* - * used for chunk mappings - * flags & EXTENT_FLAG_FS_MAPPING must be set - */ - struct map_lookup *map_lookup; - }; + /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ + struct map_lookup *map_lookup; refcount_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1a599f50837b..3270a40b0777 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -945,7 +945,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, u8 type = btrfs_file_extent_type(leaf, fi); int compress_type = btrfs_file_extent_compression(leaf, fi); - em->bdev = fs_info->fs_devices->latest_bdev; btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 27e5b269e729..0cb43b682789 
100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -296,7 +296,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.objectid = defrag->ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto cleanup; @@ -667,7 +667,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, } split->generation = gen; - split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; replace_extent_mapping(em_tree, em, split, modified); @@ -680,7 +679,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, split->start = start + len; split->len = em->start + em->len - (start + len); - split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; split->generation = gen; @@ -1636,6 +1634,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, break; } + only_release_metadata = false; sector_offset = pos & (fs_info->sectorsize - 1); reserve_bytes = round_up(write_bytes + sector_offset, fs_info->sectorsize); @@ -1692,7 +1691,7 @@ again: force_page_uptodate); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); break; } @@ -1704,7 +1703,7 @@ again: if (extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); ret = extents_locked; break; } @@ -1772,8 +1771,7 @@ again: else free_extent_state(cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, - true); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -1792,7 +1790,6 @@ again: set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_NORESERVE, NULL, NULL, GFP_NOFS); - only_release_metadata = false; } btrfs_drop_pages(pages, num_pages); @@ -1904,9 +1901,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) return -EAGAIN; + } else { inode_lock(inode); } @@ -2068,25 +2066,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - u64 len; - - /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges, and assertion failures during - * hole detection. - */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } - /* - * The range length can be represented by u64, we have to do the typecasts - * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() - */ - len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); @@ -2113,6 +2093,19 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* + * If the inode needs a full sync, make sure we use a full range to + * avoid log tree corruption, due to hole detection racing with ordered + * extent completion for adjacent ranges, and assertion failures during + * hole detection. 
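
Two details in this fsync rework deserve a note: the range widening for NEEDS_FULL_SYNC now happens with the inode lock held, and the range length is carried as u64 instead of being precomputed up front. A worked example of why the u64 casts matter, assuming only the types visible here:

    loff_t start = 0, end = LLONG_MAX;      /* a full-file sync */
    u64 len = (u64)end - (u64)start + 1;    /* 2^63, exact as u64 */
    /*
     * The same expression in signed loff_t arithmetic would overflow,
     * since LLONG_MAX is 2^63 - 1; hence the casts at the
     * btrfs_wait_ordered_range() call further down.
     */
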
Do this while holding the inode lock, to avoid races + * with other tasks. + */ + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + start = 0; + end = LLONG_MAX; + } + + /* * Before we acquired the inode's lock, someone may have dirtied more * pages in the target range. We need to make sure that writeback for * any such pages does not start while we are logging the inode, because @@ -2139,8 +2132,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. + * + * Also, the range length can be represented by u64, we have to do the + * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. */ - ret = btrfs_wait_ordered_range(inode, start, len); + ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); if (ret) { up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); @@ -2362,7 +2358,6 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; - hole_em->bdev = fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; @@ -3353,29 +3348,30 @@ out: return ret; } -static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) +static loff_t find_desired_extent(struct inode *inode, loff_t offset, + int whence) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + loff_t i_size = inode->i_size; u64 lockstart; u64 lockend; u64 start; u64 len; int ret = 0; - if (inode->i_size == 0) + if (i_size == 0 || offset >= i_size) return -ENXIO; /* - * *offset can be negative, in this case we start finding DATA/HOLE from + * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. 
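
The comment above belongs to the reworked find_desired_extent(), which now takes the offset by value and returns a loff_t: the resolved offset, or a negative errno such as -ENXIO. As the llseek hunk below shows, the SEEK_DATA/SEEK_HOLE path then reduces to this pattern under the shared inode lock:

    inode_lock_shared(inode);
    offset = find_desired_extent(inode, offset, whence);
    inode_unlock_shared(inode);

    if (offset < 0)
            return offset;          /* e.g. -ENXIO past EOF */

    return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
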
*/ - start = max_t(loff_t, 0, *offset); + start = max_t(loff_t, 0, offset); lockstart = round_down(start, fs_info->sectorsize); - lockend = round_up(i_size_read(inode), - fs_info->sectorsize); + lockend = round_up(i_size, fs_info->sectorsize); if (lockend <= lockstart) lockend = lockstart + fs_info->sectorsize; lockend--; @@ -3384,7 +3380,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - while (start < inode->i_size) { + while (start < i_size) { em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -3407,46 +3403,39 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) cond_resched(); } free_extent_map(em); - if (!ret) { - if (whence == SEEK_DATA && start >= inode->i_size) - ret = -ENXIO; - else - *offset = min_t(loff_t, start, inode->i_size); - } unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - return ret; + if (ret) { + offset = ret; + } else { + if (whence == SEEK_DATA && start >= i_size) + offset = -ENXIO; + else + offset = min_t(loff_t, start, i_size); + } + + return offset; } static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - int ret; - inode_lock(inode); switch (whence) { - case SEEK_END: - case SEEK_CUR: - offset = generic_file_llseek(file, offset, whence); - goto out; + default: + return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - if (offset >= i_size_read(inode)) { - inode_unlock(inode); - return -ENXIO; - } - - ret = find_desired_extent(inode, &offset, whence); - if (ret) { - inode_unlock(inode); - return ret; - } + inode_lock_shared(inode); + offset = find_desired_extent(inode, offset, whence); + inode_unlock_shared(inode); + break; } - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); -out: - inode_unlock(inode); - return offset; + if (offset < 0) + return offset; + + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } static int btrfs_file_open(struct inode *inode, struct file *filp) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d54dcd0ab230..3283da419200 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -78,7 +78,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. 
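
The NOFS comment above pairs with the scope API used in the next hunk: between memalloc_nofs_save() and memalloc_nofs_restore(), GFP_KERNEL allocations are implicitly demoted to GFP_NOFS, so the iget cannot recurse into filesystem reclaim while the cache inode is being looked up. The pattern as it appears here:

    unsigned int nofs_flag;

    nofs_flag = memalloc_nofs_save();
    /* allocations in this scope will not re-enter the filesystem */
    inode = btrfs_iget_path(fs_info->sb, &location, root, path);
    memalloc_nofs_restore(nofs_flag);
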
*/ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path); + inode = btrfs_iget_path(fs_info->sb, &location, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) @@ -91,8 +91,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, return inode; } -struct inode *lookup_free_space_inode( - struct btrfs_block_group_cache *block_group, +struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -107,7 +106,7 @@ struct inode *lookup_free_space_inode( return inode; inode = __lookup_free_space_inode(fs_info->tree_root, path, - block_group->key.objectid); + block_group->start); if (IS_ERR(inode)) return inode; @@ -190,7 +189,7 @@ static int __create_free_space_inode(struct btrfs_root *root, } int create_free_space_inode(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; @@ -201,7 +200,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, return ret; return __create_free_space_inode(trans->fs_info->tree_root, trans, path, - ino, block_group->key.objectid); + ino, block_group->start); } int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, @@ -224,7 +223,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -385,6 +384,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); + if (page->mapping != inode->i_mapping) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "free space cache page truncated"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } if (!PageUptodate(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); @@ -814,7 +819,7 @@ free_cache: goto out; } -int load_free_space_cache(struct btrfs_block_group_cache *block_group) +int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -822,7 +827,7 @@ int load_free_space_cache(struct btrfs_block_group_cache *block_group) struct btrfs_path *path; int ret = 0; bool matched; - u64 used = btrfs_block_group_used(&block_group->item); + u64 used = block_group->used; /* * If this block group has been marked to be cleared for one reason or @@ -876,13 +881,13 @@ int load_free_space_cache(struct btrfs_block_group_cache *block_group) spin_unlock(&block_group->lock); ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, - path, block_group->key.objectid); + path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; spin_lock(&ctl->tree_lock); - matched = (ctl->free_space == (block_group->key.offset - used - + matched = (ctl->free_space == (block_group->length - used - block_group->bytes_super)); spin_unlock(&ctl->tree_lock); @@ -890,7 +895,7 @@ int load_free_space_cache(struct btrfs_block_group_cache *block_group) __btrfs_remove_free_space_cache(ctl); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", - block_group->key.objectid); + block_group->start); ret = 
-1; } out: @@ -903,7 +908,7 @@ out: btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now", - block_group->key.objectid); + block_group->start); } iput(inode); @@ -913,7 +918,7 @@ out: static noinline_for_stack int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, int *entries, int *bitmaps, struct list_head *bitmap_list) { @@ -1041,7 +1046,7 @@ fail: } static noinline_for_stack int write_pinned_extent_entries( - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) { @@ -1061,9 +1066,9 @@ static noinline_for_stack int write_pinned_extent_entries( */ unpin = block_group->fs_info->pinned_extents; - start = block_group->key.objectid; + start = block_group->start; - while (start < block_group->key.objectid + block_group->key.offset) { + while (start < block_group->start + block_group->length) { ret = find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL); @@ -1071,13 +1076,12 @@ static noinline_for_stack int write_pinned_extent_entries( return 0; /* This pinned extent is out of our range */ - if (extent_start >= block_group->key.objectid + - block_group->key.offset) + if (extent_start >= block_group->start + block_group->length) return 0; extent_start = max(extent_start, start); - extent_end = min(block_group->key.objectid + - block_group->key.offset, extent_end + 1); + extent_end = min(block_group->start + block_group->length, + extent_end + 1); len = extent_end - extent_start; *entries += 1; @@ -1141,7 +1145,7 @@ cleanup_write_cache_enospc(struct inode *inode, static int __btrfs_wait_cache_io(struct btrfs_root *root, struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_path *path, u64 offset) { @@ -1168,7 +1172,7 @@ out: #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", - block_group->key.objectid); + block_group->start); #endif } } @@ -1210,12 +1214,12 @@ static int btrfs_wait_cache_io_root(struct btrfs_root *root, } int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, block_group, &block_group->io_ctl, - path, block_group->key.objectid); + path, block_group->start); } /** @@ -1231,7 +1235,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { @@ -1369,7 +1373,7 @@ out_unlock: } int btrfs_write_out_cache(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1394,7 +1398,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, #ifdef DEBUG btrfs_err(fs_info, "failed to write free space cache for block group %llu", - block_group->key.objectid); + block_group->start); #endif spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; @@ 
-1647,11 +1651,11 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { - struct btrfs_block_group_cache *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->private; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; - u64 size = block_group->key.offset; + u64 size = block_group->length; u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); @@ -1991,7 +1995,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { - struct btrfs_block_group_cache *block_group = ctl->private; + struct btrfs_block_group *block_group = ctl->private; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; @@ -2028,7 +2032,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * so allow those block groups to still be allowed to have a bitmap * entry. */ - if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) return false; return true; @@ -2043,7 +2047,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; - struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group *block_group = NULL; int added = 0; u64 bytes, offset, bytes_added; int ret; @@ -2380,7 +2384,7 @@ out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, +int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { return __btrfs_add_free_space(block_group->fs_info, @@ -2388,7 +2392,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, bytenr, size); } -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, +int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2478,7 +2482,7 @@ out: return ret; } -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, +void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -2503,14 +2507,14 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, "%d blocks of free space at or bigger than bytes is", count); } -void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; - ctl->start = block_group->key.objectid; + ctl->start = block_group->start; ctl->private = block_group; ctl->op = &free_space_op; INIT_LIST_HEAD(&ctl->trimming_ranges); @@ -2532,7 +2536,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) */ static int __btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2598,7 +2602,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) spin_unlock(&ctl->tree_lock); } -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache 
*block_group) +void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_cluster *cluster; @@ -2620,7 +2624,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) } -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, +u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size) { @@ -2674,7 +2678,7 @@ out: * cluster and remove the cluster from it. */ int btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl; @@ -2708,7 +2712,7 @@ int btrfs_return_cluster_to_free_space( return ret; } -static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct btrfs_free_space *entry, u64 bytes, u64 min_start, @@ -2741,7 +2745,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, * if it couldn't find anything suitably large, or a logical disk offset * if things worked out */ -u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, +u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size) { @@ -2827,7 +2831,7 @@ out: return ret; } -static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, +static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, @@ -2909,7 +2913,7 @@ again: * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int -setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, +setup_cluster_no_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) @@ -3000,7 +3004,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * that we have already failed to find extents that will work. 
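
setup_cluster_no_bitmap() above and setup_cluster_bitmap() below are tried in that order by btrfs_find_space_cluster(): plain extent entries first, bitmap entries only as a fallback. A simplified sketch of that call order, with error handling and locking elided:

    ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps,
                                  offset, bytes + empty_size,
                                  cont1_bytes, min_bytes);
    if (ret)
            /* no long enough run of plain extents; try the bitmaps */
            ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
                                       offset, bytes + empty_size,
                                       cont1_bytes, min_bytes);
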
*/ static noinline int -setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, +setup_cluster_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) @@ -3050,7 +3054,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, * returns zero and sets up cluster if things worked out, otherwise * it returns -ENOSPC */ -int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group, +int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) { @@ -3141,7 +3145,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) cluster->block_group = NULL; } -static int do_trimming(struct btrfs_block_group_cache *block_group, +static int do_trimming(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, struct btrfs_trim_range *trim_entry) @@ -3186,7 +3190,7 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, return ret; } -static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, +static int trim_no_bitmap(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -3271,7 +3275,7 @@ out: return ret; } -static int trim_bitmaps(struct btrfs_block_group_cache *block_group, +static int trim_bitmaps(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -3352,12 +3356,12 @@ next: return ret; } -void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache) +void btrfs_get_block_group_trimming(struct btrfs_block_group *cache) { atomic_inc(&cache->trimming); } -void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) +void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_map_tree *em_tree; @@ -3373,7 +3377,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) mutex_lock(&fs_info->chunk_mutex); em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, block_group->key.objectid, + em = lookup_extent_mapping(em_tree, block_group->start, 1); BUG_ON(!em); /* logic error, can't happen */ remove_extent_mapping(em_tree, em); @@ -3392,7 +3396,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) } } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, +int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen) { int ret; @@ -3590,7 +3594,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, * how the free space cache loading stuff works, so you can get really weird * configurations. */ -int test_add_free_space_entry(struct btrfs_block_group_cache *cache, +int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; @@ -3658,7 +3662,7 @@ again: * just used to check the absence of space, so if there is free space in the * range at all we will return 1. 
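
test_add_free_space_entry() and test_check_exists() are compiled only under CONFIG_BTRFS_FS_RUN_SANITY_TESTS. A hypothetical self-test using them could look like this; the offsets and sizes are illustrative, and the SZ_* constants come from linux/sizes.h:

    /* Seed a 4MiB extent entry at 1MiB, then probe a subrange of it. */
    ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, false);
    if (!ret && !test_check_exists(cache, SZ_2M, SZ_1M))
            ret = -EINVAL;  /* expected free space was not reported */
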
*/ -int test_check_exists(struct btrfs_block_group_cache *cache, +int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 39c32c8fc24f..ba9a23241101 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -50,24 +50,23 @@ struct btrfs_io_ctl { unsigned check_crcs:1; }; -struct inode *lookup_free_space_inode( - struct btrfs_block_group_cache *block_group, +struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path); int create_free_space_inode(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct inode *inode); -int load_free_space_cache(struct btrfs_block_group_cache *block_group); +int load_free_space_cache(struct btrfs_block_group *block_group); int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); int btrfs_write_out_cache(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); struct inode *lookup_free_ino_inode(struct btrfs_root *root, struct btrfs_path *path); @@ -81,42 +80,40 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode); -void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, u64 bytenr, u64 size); -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, +int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, +int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, +void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, +void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes); -int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group, +int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); -u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, +u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size); int 
btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster); -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, +int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); /* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int test_add_free_space_entry(struct btrfs_block_group_cache *cache, +int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap); -int test_check_exists(struct btrfs_block_group_cache *cache, - u64 offset, u64 bytes); +int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes); #endif #endif diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 48a03f5240f5..258cb3fae17a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -13,10 +13,10 @@ #include "block-group.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); -void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) +void set_free_space_tree_thresholds(struct btrfs_block_group *cache) { u32 bitmap_range; size_t bitmap_size; @@ -27,8 +27,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) * exceeds that required for using bitmaps. */ bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; - num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, - bitmap_range); + num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range); bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; total_bitmap_size = num_bitmaps * bitmap_size; cache->bitmap_high_thresh = div_u64(total_bitmap_size, @@ -45,7 +44,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) } static int add_new_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_root *root = trans->fs_info->free_space_root; @@ -54,9 +53,9 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - key.objectid = block_group->key.objectid; + key.objectid = block_group->start; key.type = BTRFS_FREE_SPACE_INFO_KEY; - key.offset = block_group->key.offset; + key.offset = block_group->length; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); if (ret) @@ -78,7 +77,7 @@ out: EXPORT_FOR_TESTS struct btrfs_free_space_info *search_free_space_info( struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, int cow) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -86,16 +85,16 @@ struct btrfs_free_space_info *search_free_space_info( struct btrfs_key key; int ret; - key.objectid = block_group->key.objectid; + key.objectid = block_group->start; key.type = BTRFS_FREE_SPACE_INFO_KEY; - key.offset = block_group->key.offset; + key.offset = block_group->length; ret = btrfs_search_slot(trans, root, &key, path, 0, cow); if (ret < 0) return ERR_PTR(ret); if (ret != 0) { btrfs_warn(fs_info, "missing free space info for %llu", - block_group->key.objectid); + block_group->start); ASSERT(0); return ERR_PTR(-ENOENT); } @@ -180,7 +179,7 @@ static 
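The threshold setup in set_free_space_tree_thresholds above rests on one calculation: each bitmap item covers sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS bytes, and the (x + range - 1) / range ceiling division counts how many items cover the whole block group. A standalone illustration with assumed constants (4 KiB sectors; div_u64 replaced by plain division):

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE	4096ULL
#define BITMAP_BYTES	256ULL			/* BTRFS_FREE_SPACE_BITMAP_SIZE */
#define BITMAP_BITS	(BITMAP_BYTES * 8)	/* BTRFS_FREE_SPACE_BITMAP_BITS */

int main(void)
{
	uint64_t length = 1024ULL * 1024 * 1024;		/* 1 GiB block group */
	uint64_t bitmap_range = SECTORSIZE * BITMAP_BITS;	/* 8 MiB per bitmap */
	uint64_t num_bitmaps = (length + bitmap_range - 1) / bitmap_range;

	printf("%llu bitmap items\n", (unsigned long long)num_bitmaps);	/* 128 */
	return 0;
}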
void le_bitmap_set(unsigned long *map, unsigned int start, int len) EXPORT_FOR_TESTS int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -197,7 +196,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->key.offset, + bitmap_size = free_space_bitmap_size(block_group->length, fs_info->sectorsize); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { @@ -205,8 +204,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; @@ -224,8 +223,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { - ASSERT(found_key.objectid == block_group->key.objectid); - ASSERT(found_key.offset == block_group->key.offset); + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); done = 1; break; } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { @@ -271,7 +270,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -320,7 +319,7 @@ out: EXPORT_FOR_TESTS int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -336,7 +335,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->key.offset, + bitmap_size = free_space_bitmap_size(block_group->length, fs_info->sectorsize); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { @@ -344,8 +343,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, goto out; } - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; @@ -363,8 +362,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { - ASSERT(found_key.objectid == block_group->key.objectid); - ASSERT(found_key.offset == block_group->key.offset); + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); done = 1; break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { @@ -413,7 +412,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize); + nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize); start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < 
nrbits) { @@ -437,7 +436,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -453,7 +452,7 @@ out: } static int update_free_space_extent_count(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, int new_extents) { @@ -491,7 +490,7 @@ out: } EXPORT_FOR_TESTS -int free_space_test_bit(struct btrfs_block_group_cache *block_group, +int free_space_test_bit(struct btrfs_block_group *block_group, struct btrfs_path *path, u64 offset) { struct extent_buffer *leaf; @@ -513,7 +512,7 @@ int free_space_test_bit(struct btrfs_block_group_cache *block_group, return !!extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_block_group_cache *block_group, +static void free_space_set_bits(struct btrfs_block_group *block_group, struct btrfs_path *path, u64 *start, u64 *size, int bit) { @@ -581,7 +580,7 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans, * the bitmap. */ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size, int remove) { @@ -597,7 +596,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, * Read the bit for the block immediately before the extent of space if * that block is within the block group. */ - if (start > block_group->key.objectid) { + if (start > block_group->start) { u64 prev_block = start - block_group->fs_info->sectorsize; key.objectid = prev_block; @@ -649,7 +648,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, * Read the bit for the block immediately after the extent of space if * that block is within the block group. */ - if (end < block_group->key.objectid + block_group->key.offset) { + if (end < block_group->start + block_group->length) { /* The next block may be in the next bitmap. */ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (end >= key.objectid + key.offset) { @@ -694,7 +693,7 @@ out: } static int remove_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { @@ -781,7 +780,7 @@ out: EXPORT_FOR_TESTS int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { struct btrfs_free_space_info *info; @@ -812,7 +811,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, int remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_path *path; int ret; @@ -846,7 +845,7 @@ out: } static int add_free_space_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { @@ -880,7 +879,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, new_key.offset = size; /* Search for a neighbor on the left. 
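The bit-scanning loop of convert_free_space_to_extents, continued in the hunk above, turns each run of set bits back into one FREE_SPACE_EXTENT item by alternating between "find next set bit" and "find next zero bit". A userspace sketch of that scan, with a naive stand-in for the kernel's find_next_bit_le/find_next_zero_bit_le helpers:

#include <stddef.h>
#include <stdio.h>

/* Naive stand-in for find_next_bit_le()/find_next_zero_bit_le(). */
static size_t next_bit(const unsigned char *map, size_t nbits, size_t from, int want)
{
	for (size_t i = from; i < nbits; i++)
		if (((map[i / 8] >> (i % 8)) & 1) == want)
			return i;
	return nbits;
}

int main(void)
{
	unsigned char bitmap[] = { 0x1e, 0xc0 };	/* bits 1-4 and 14-15 set */
	size_t nbits = 16;
	size_t start = next_bit(bitmap, nbits, 0, 1);

	while (start < nbits) {
		size_t end = next_bit(bitmap, nbits, start, 0);
		/* One run of set bits == one free space extent item. */
		printf("extent at bit %zu, %zu bits long\n", start, end - start);
		start = next_bit(bitmap, nbits, end, 1);
	}
	return 0;
}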
*/ - if (start == block_group->key.objectid) + if (start == block_group->start) goto right; key.objectid = start - 1; key.type = (u8)-1; @@ -900,8 +899,8 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, found_start = key.objectid; found_end = key.objectid + key.offset; - ASSERT(found_start >= block_group->key.objectid && - found_end > block_group->key.objectid); + ASSERT(found_start >= block_group->start && + found_end > block_group->start); ASSERT(found_start < start && found_end <= start); /* @@ -920,7 +919,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans, right: /* Search for a neighbor on the right. */ - if (end == block_group->key.objectid + block_group->key.offset) + if (end == block_group->start + block_group->length) goto insert; key.objectid = end; key.type = (u8)-1; @@ -940,8 +939,8 @@ right: found_start = key.objectid; found_end = key.objectid + key.offset; - ASSERT(found_start >= block_group->key.objectid && - found_end > block_group->key.objectid); + ASSERT(found_start >= block_group->start && + found_end > block_group->start); ASSERT((found_start < start && found_end <= start) || (found_start >= end && found_end > end)); @@ -974,7 +973,7 @@ out: EXPORT_FOR_TESTS int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size) { struct btrfs_free_space_info *info; @@ -1005,7 +1004,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans, int add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_path *path; int ret; @@ -1043,7 +1042,7 @@ out: * through the normal add/remove hooks. */ static int populate_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct btrfs_root *extent_root = trans->fs_info->extent_root; struct btrfs_path *path, *path2; @@ -1075,7 +1074,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's * contained in. 
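The left/right neighbor handling in add_free_space_extent above is plain interval coalescing: if an existing free extent ends exactly where the new one starts, or starts exactly where it ends, the two are merged into a single item. A minimal sketch of that arithmetic with illustrative types:

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start, size; };

/* Absorb adjacent neighbors into 'new', mirroring add_free_space_extent. */
static void merge_neighbors(struct extent *new, const struct extent *left,
			    const struct extent *right)
{
	if (left && left->start + left->size == new->start) {
		new->start = left->start;	/* grow to the left */
		new->size += left->size;
	}
	if (right && new->start + new->size == right->start)
		new->size += right->size;	/* grow to the right */
}

int main(void)
{
	struct extent left = { 0, 4096 }, right = { 8192, 4096 };
	struct extent new = { 4096, 4096 };

	merge_neighbors(&new, &left, &right);
	printf("start=%llu size=%llu\n",	/* start=0 size=12288 */
	       (unsigned long long)new.start, (unsigned long long)new.size);
	return 0;
}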
*/ - key.objectid = block_group->key.objectid; + key.objectid = block_group->start; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; @@ -1084,8 +1083,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, goto out_locked; ASSERT(ret == 0); - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; while (1) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -1109,7 +1108,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, else start += key.offset; } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - if (key.objectid != block_group->key.objectid) + if (key.objectid != block_group->start) break; } @@ -1140,7 +1139,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *free_space_root; - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct rb_node *node; int ret; @@ -1159,7 +1158,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) node = rb_first(&fs_info->block_group_cache_tree); while (node) { - block_group = rb_entry(node, struct btrfs_block_group_cache, + block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = populate_free_space_tree(trans, block_group); if (ret) @@ -1265,7 +1264,7 @@ abort: } static int __add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; @@ -1277,12 +1276,12 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, return ret; return __add_to_free_space_tree(trans, block_group, path, - block_group->key.objectid, - block_group->key.offset); + block_group->start, + block_group->length); } int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path = NULL; @@ -1312,7 +1311,7 @@ out: } int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct btrfs_root *root = trans->fs_info->free_space_root; struct btrfs_path *path; @@ -1336,8 +1335,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, goto out; } - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; + start = block_group->start; + end = block_group->start + block_group->length; key.objectid = end - 1; key.type = (u8)-1; @@ -1355,8 +1354,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { - ASSERT(found_key.objectid == block_group->key.objectid); - ASSERT(found_key.offset == block_group->key.offset); + ASSERT(found_key.objectid == block_group->start); + ASSERT(found_key.offset == block_group->length); done = 1; nr++; path->slots[0]--; @@ -1391,7 +1390,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key 
key; @@ -1407,7 +1406,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, fs_info = block_group->fs_info; root = fs_info->free_space_root; - end = block_group->key.objectid + block_group->key.offset; + end = block_group->start + block_group->length; while (1) { ret = btrfs_next_item(root, path); @@ -1454,7 +1453,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -1472,7 +1471,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, struct btrfs_path *path, u32 expected_extent_count) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_key key; @@ -1485,7 +1484,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, fs_info = block_group->fs_info; root = fs_info->free_space_root; - end = block_group->key.objectid + block_group->key.offset; + end = block_group->start + block_group->length; while (1) { ret = btrfs_next_item(root, path); @@ -1516,7 +1515,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, if (extent_count != expected_extent_count) { btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", - block_group->key.objectid, extent_count, + block_group->start, extent_count, expected_extent_count); ASSERT(0); ret = -EIO; @@ -1532,7 +1531,7 @@ out: int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; struct btrfs_path *path; u32 extent_count, flags; diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 360d50e1cdea..dc2463e4cfe3 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -16,14 +16,14 @@ struct btrfs_caching_control; #define BTRFS_FREE_SPACE_BITMAP_SIZE 256 #define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) -void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group); +void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); int load_free_space_tree(struct btrfs_caching_control *caching_ctl); int add_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group); + struct btrfs_block_group *block_group); int remove_block_group_free_space(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group); + struct btrfs_block_group *block_group); int add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size); int remove_from_free_space_tree(struct btrfs_trans_handle *trans, @@ -32,21 +32,21 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, int cow); int __add_to_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache 
*block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size); int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 start, u64 size); int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); int convert_free_space_to_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct btrfs_path *path); -int free_space_test_bit(struct btrfs_block_group_cache *block_group, +int free_space_test_bit(struct btrfs_block_group *block_group, struct btrfs_path *path, u64 offset); #endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 63cad7865d75..37345fb6191d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -501,13 +501,13 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0f2754eaa05b..56032c518b26 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -368,6 +368,7 @@ struct async_chunk { u64 end; unsigned int write_flags; struct list_head extents; + struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; atomic_t *pending; }; @@ -474,6 +475,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; + u64 i_size; int ret = 0; struct page **pages = NULL; unsigned long nr_pages; @@ -488,7 +490,19 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); - actual_end = min_t(u64, i_size_read(inode), end + 1); + /* + * We need to save i_size before now because it could change in between + * us evaluating the size and assigning it. This is because we lock and + * unlock the page in truncate and fallocate, and then modify the i_size + * later on. + * + * The barriers are to emulate READ_ONCE, remove that once i_size_read + * does that for us. + */ + barrier(); + i_size = i_size_read(inode); + barrier(); + actual_end = min_t(u64, i_size, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; @@ -699,10 +713,12 @@ cleanup_and_bail_uncompressed: * to our extent and set things up for the async work queue to run * cow_file_range to do the normal delalloc dance. 
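The barrier() pair in the compress_file_range hunk above forces a single load of i_size, as the comment says. The hazard is the compiler reloading i_size between the comparison and the use, so a concurrent truncate or fallocate could change the value mid-decision; a READ_ONCE-style volatile access is the usual idiom. A userspace sketch (the READ_ONCE stand-in is illustrative, not the kernel macro):

#include <stdint.h>

#define READ_ONCE_U64(x)	(*(volatile uint64_t *)&(x))

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* Snapshot the (concurrently updated) size once, then derive from it. */
uint64_t clamp_end(uint64_t *i_size_ptr, uint64_t end)
{
	uint64_t i_size = READ_ONCE_U64(*i_size_ptr);

	/*
	 * Without the single forced load, the compiler could legally read
	 * *i_size_ptr twice and hand two different values to min_u64().
	 */
	return min_u64(i_size, end + 1);
}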
*/ - if (page_offset(async_chunk->locked_page) >= start && - page_offset(async_chunk->locked_page) <= end) + if (async_chunk->locked_page && + (page_offset(async_chunk->locked_page) >= start && + page_offset(async_chunk->locked_page) <= end)) { __set_page_dirty_nobuffers(async_chunk->locked_page); /* unlocked later on in the async handlers */ + } if (redirty) extent_range_redirty_for_io(inode, start, end); @@ -782,7 +798,7 @@ retry: async_extent->start + async_extent->ram_size - 1, WB_SYNC_ALL); - else if (ret) + else if (ret && async_chunk->locked_page) unlock_page(async_chunk->locked_page); kfree(async_extent); cond_resched(); @@ -865,7 +881,8 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages, - async_chunk->write_flags)) { + async_chunk->write_flags, + async_chunk->blkcg_css)) { struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -1183,6 +1200,8 @@ static noinline void async_cow_free(struct btrfs_work *work) async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); /* * Since the pointer to 'pending' is at the beginning of the array of * async_chunk's, freeing it ensures the whole array has been freed. @@ -1191,12 +1210,14 @@ static noinline void async_cow_free(struct btrfs_work *work) kvfree(async_chunk->pending); } -static int cow_file_range_async(struct inode *inode, struct page *locked_page, +static int cow_file_range_async(struct inode *inode, + struct writeback_control *wbc, + struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, - unsigned int write_flags) + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; @@ -1205,6 +1226,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, int i; bool should_compress; unsigned nofs_flag; + const unsigned int write_flags = wbc_to_write_flags(wbc); unlock_extent(&BTRFS_I(inode)->io_tree, start, end); @@ -1251,14 +1273,45 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; - async_chunk[i].locked_page = locked_page; async_chunk[i].write_flags = write_flags; INIT_LIST_HEAD(&async_chunk[i].extents); - btrfs_init_work(&async_chunk[i].work, - btrfs_delalloc_helper, - async_cow_start, async_cow_submit, - async_cow_free); + /* + * The locked_page comes all the way from writepage and it's + * the original page we were actually given. As we spread + * this large delalloc region across multiple async_chunk + * structs, only the first struct needs a pointer to locked_page. + * + * This way we don't need racy decisions about who is supposed + * to unlock it. + */ + if (locked_page) { + /* + * Depending on the compressibility, the pages might or + * might not go through async. We want all of them to + * be accounted against wbc once. Let's do it here + * before the paths diverge. wbc accounting is used + * only for foreign writeback detection and doesn't + * need full accuracy. Just account the whole thing + * against the first page.
+ */ + wbc_account_cgroup_owner(wbc, locked_page, + cur_end - start); + async_chunk[i].locked_page = locked_page; + locked_page = NULL; + } else { + async_chunk[i].locked_page = NULL; + } + + if (blkcg_css != blkcg_root_css) { + css_get(blkcg_css); + async_chunk[i].blkcg_css = blkcg_css; + } else { + async_chunk[i].blkcg_css = NULL; + } + + btrfs_init_work(&async_chunk[i].work, async_cow_start, + async_cow_submit, async_cow_free); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); @@ -1684,7 +1737,6 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, { int ret; int force_cow = need_force_cow(inode, start, end); - unsigned int write_flags = wbc_to_write_flags(wbc); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1699,9 +1751,8 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); - ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written, - write_flags); + ret = cow_file_range_async(inode, wbc, locked_page, start, end, + page_started, nr_written); } if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, @@ -2097,7 +2148,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, } mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); out: if (ret) { @@ -2201,12 +2252,16 @@ again: mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); ClearPageChecked(page); - goto out; + goto out_reserved; } ClearPageChecked(page); set_page_dirty(page); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false); +out_reserved: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + if (ret) + btrfs_delalloc_release_space(inode, data_reserved, page_start, + PAGE_SIZE, true); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -2247,8 +2302,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) SetPageChecked(page); get_page(page); - btrfs_init_work(&fixup->work, btrfs_fixup_helper, - btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EBUSY; @@ -2662,7 +2716,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) { srcu_read_unlock(&fs_info->subvol_srcu, index); return 0; @@ -2986,7 +3040,7 @@ out_kfree: static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); @@ -3014,7 +3068,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) int compress_type = 0; int ret = 0; u64 logical_len = ordered_extent->len; - bool nolock; + bool freespace_inode; bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; @@ -3025,7 +3079,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) !test_bit(BTRFS_ORDERED_DIRECT, 
&ordered_extent->flags)) clear_new_delalloc_bytes = true; - nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); + freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; @@ -3056,8 +3110,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (nolock) - trans = btrfs_join_transaction_nolock(root); + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3091,8 +3145,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) EXTENT_DEFRAG, 0, 0, &cached_state); } - if (nolock) - trans = btrfs_join_transaction_nolock(root); + if (freespace_inode) + trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -3241,7 +3295,6 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; - btrfs_work_func_t func; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -3250,16 +3303,12 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } - btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, - NULL); + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered_extent->work); } @@ -3518,7 +3567,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, &found_key, root, NULL); + inode = btrfs_iget(fs_info->sb, &found_key, root); ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ENOENT) goto out; @@ -4951,7 +5000,7 @@ again: if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } @@ -5018,7 +5067,7 @@ out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -5140,7 +5189,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; - hole_em->bdev = fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = fs_info->generation; @@ -5737,12 +5785,14 @@ static struct inode *btrfs_iget_locked(struct super_block *s, return inode; } -/* Get an inode object given its location and corresponding root. - * Returns in *is_new if the inode was read from disk +/* + * Get an inode object given its location and corresponding root. 
+ * Path can be preallocated to prevent recursing back to iget through + * allocator. NULL is also valid but may require an additional allocation + * later. */ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new, - struct btrfs_path *path) + struct btrfs_root *root, struct btrfs_path *path) { struct inode *inode; @@ -5757,8 +5807,6 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); - if (new) - *new = 1; } else { iget_failed(inode); /* @@ -5776,9 +5824,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, } struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new) + struct btrfs_root *root) { - return btrfs_iget_path(s, location, root, new, NULL); + return btrfs_iget_path(s, location, root, NULL); } static struct inode *new_simple_dir(struct super_block *s, @@ -5844,7 +5892,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, &location, root, NULL); + inode = btrfs_iget(dir->i_sb, &location, root); if (IS_ERR(inode)) return inode; @@ -5869,7 +5917,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir->i_sb, &location, sub_root); } else { - inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); + inode = btrfs_iget(dir->i_sb, &location, sub_root); } srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -6918,8 +6966,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - if (em) - em->bdev = fs_info->fs_devices->latest_bdev; read_unlock(&em_tree->lock); if (em) { @@ -6935,7 +6981,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, err = -ENOMEM; goto out; } - em->bdev = fs_info->fs_devices->latest_bdev; em->start = EXTENT_MAP_HOLE; em->orig_start = EXTENT_MAP_HOLE; em->len = (u64)-1; @@ -7194,7 +7239,6 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, err = -ENOMEM; goto out; } - em->bdev = NULL; ASSERT(hole_em); /* @@ -7554,7 +7598,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, { struct extent_map_tree *em_tree; struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; ASSERT(type == BTRFS_ORDERED_PREALLOC || @@ -7572,7 +7615,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, em->len = len; em->block_len = block_len; em->block_start = block_start; - em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; @@ -7611,6 +7653,8 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em, struct inode *inode, u64 start, u64 len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + if (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return -ENOENT; @@ -7620,7 +7664,7 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em, bh_result->b_blocknr = (em->block_start + (start - em->start)) >> inode->i_blkbits; bh_result->b_size = len; - bh_result->b_bdev = em->bdev; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; set_buffer_mapped(bh_result); return 0; @@ -7703,7 +7747,7 @@ skip_cow: bh_result->b_blocknr = 
(em->block_start + (start - em->start)) >> inode->i_blkbits; bh_result->b_size = len; - bh_result->b_bdev = em->bdev; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; set_buffer_mapped(bh_result); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) @@ -7845,7 +7889,7 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode, if (ret) return ret; - ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num); return ret; } @@ -8198,18 +8242,14 @@ static void __endio_write_update_ordered(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; struct btrfs_workqueue *wq; - btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; u64 last_offset; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(BTRFS_I(inode))) wq = fs_info->endio_freespace_worker; - func = btrfs_freespace_write_helper; - } else { + else wq = fs_info->endio_write_workers; - func = btrfs_endio_write_helper; - } while (ordered_offset < offset + bytes) { last_offset = ordered_offset; @@ -8217,9 +8257,8 @@ static void __endio_write_update_ordered(struct inode *inode, &ordered_offset, ordered_bytes, uptodate)) { - btrfs_init_work(&ordered->work, func, - finish_ordered_fn, - NULL, NULL); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, + NULL); btrfs_queue_work(wq, &ordered->work); } /* @@ -8376,7 +8415,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, goto err; } map: - ret = btrfs_map_bio(fs_info, bio, 0, 0); + ret = btrfs_map_bio(fs_info, bio, 0); err: return ret; } @@ -8709,7 +8748,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -9059,7 +9098,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -9068,7 +9107,7 @@ again: out_unlock: unlock_page(page); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: @@ -9308,7 +9347,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_failure_tree.track_uptodate = true; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); - mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); @@ -9537,6 +9575,9 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_notrans; } + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + /* * We need to find a free sequence number both in the source and * in the destination directory for the exchange. @@ -9731,6 +9772,18 @@ out_fail: commit_transaction = true; } if (commit_transaction) { + /* + * We may have set commit_transaction when logging the new name + * in the destination root, in which case we left the source + * root context in the list of log contexts.
So make sure we + * remove it to avoid invalid memory accesses, since the context + * was allocated in our stack frame. + */ + if (sync_log_root) { + mutex_lock(&root->log_mutex); + list_del_init(&ctx_root.list); + mutex_unlock(&root->log_mutex); + } ret = btrfs_commit_transaction(trans); } else { int ret2; @@ -9744,6 +9797,9 @@ out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + ASSERT(list_empty(&ctx_root.list)); + ASSERT(list_empty(&ctx_dest.list)); + return ret; } @@ -10088,8 +10144,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, - btrfs_run_delalloc_work, NULL, NULL); + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -10422,7 +10477,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; - em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; @@ -10778,7 +10832,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; u64 len = isize - start; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index de730e56d3f5..a1ee0b775e65 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -479,10 +479,9 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) return put_user(inode->i_generation, arg); } -static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; @@ -541,7 +540,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } -int btrfs_is_empty_uuid(u8 *uuid) +int __pure btrfs_is_empty_uuid(u8 *uuid) { int i; @@ -1360,8 +1359,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1372,8 +1370,7 @@ out: btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - true); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; @@ -1411,7 +1408,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, return -EINVAL; if (do_compress) { - if (range->compress_type > BTRFS_COMPRESS_TYPES) + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) return -EINVAL; if (range->compress_type) compress_type = range->compress_type; @@ -2464,7 +2461,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, goto out; } - temp_inode = btrfs_iget(sb, &key2, root, NULL); + temp_inode = btrfs_iget(sb, &key2, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out; @@ -4034,16 
+4031,15 @@ out: static void get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space) { - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; space->total_bytes = 0; space->used_bytes = 0; space->flags = 0; list_for_each_entry(block_group, groups_list, list) { space->flags = block_group->flags; - space->total_bytes += block_group->key.offset; - space->used_bytes += - btrfs_block_group_used(&block_group->item); + space->total_bytes += block_group->length; + space->used_bytes += block_group->used; } } @@ -4197,9 +4193,6 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - btrfs_warn(root->fs_info, - "START_SYNC ioctl is deprecated and will be removed in kernel 5.7"); - trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) @@ -4227,9 +4220,6 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, { u64 transid; - btrfs_warn(fs_info, - "WAIT_SYNC ioctl is deprecated and will be removed in kernel 5.7"); - if (argp) { if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; @@ -4960,10 +4950,9 @@ drop_write: return ret; } -static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) +static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_rescan_args *qsa; int ret = 0; @@ -4986,11 +4975,9 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) return ret; } -static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) +static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -5162,10 +5149,9 @@ out: return ret; } -static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); size_t len; int ret; char label[BTRFS_LABEL_SIZE]; @@ -5249,10 +5235,9 @@ int btrfs_ioctl_get_supported_features(void __user *arg) return 0; } -static int btrfs_ioctl_get_features(struct file *file, void __user *arg) +static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, + void __user *arg) { - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags features; @@ -5453,11 +5438,11 @@ long btrfs_ioctl(struct file *file, unsigned int case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case FS_IOC_GETFSLABEL: - return btrfs_ioctl_get_fslabel(file, argp); + return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: return btrfs_ioctl_set_fslabel(file, argp); case FITRIM: - return btrfs_ioctl_fitrim(file, argp); + return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: @@ -5562,15 +5547,15 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN: return btrfs_ioctl_quota_rescan(file, argp); case BTRFS_IOC_QUOTA_RESCAN_STATUS: - return 
btrfs_ioctl_quota_rescan_status(file, argp); + return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: - return btrfs_ioctl_quota_rescan_wait(file, argp); + return btrfs_ioctl_quota_rescan_wait(fs_info, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(argp); case BTRFS_IOC_GET_FEATURES: - return btrfs_ioctl_get_features(file, argp); + return btrfs_ioctl_get_features(fs_info, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case FS_IOC_FSGETXATTR: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 7f9a578a1a20..571c4826c428 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -13,65 +13,164 @@ #include "extent_io.h" #include "locking.h" +/* + * Extent buffer locking + * ===================== + * + * The locks use a custom scheme that allows more operations than are + * available from current locking primitives. The building blocks are still + * rwlock and wait queues. + * + * Required semantics: + * + * - reader/writer exclusion + * - writer/writer exclusion + * - reader/reader sharing + * - spinning lock semantics + * - blocking lock semantics + * - try-lock semantics for readers and writers + * - one level nesting, allowing read lock to be taken by the same thread that + * already has write lock + * + * The extent buffer locks (also called tree locks) manage access to eb data + * related to the storage in the b-tree (keys, items, but not the individual + * members of eb). + * We want concurrency of many readers and safe updates. The underlying locking + * is done by read-write spinlock and the blocking part is implemented using + * counters and wait queues. + * + * spinning semantics - the low-level rwlock is held so all other threads that + * want to take it are spinning on it. + * + * blocking semantics - the low-level rwlock is not held but the counter + * denotes how many times the blocking lock was held; + * sleeping is possible + * + * Write lock always allows only one thread to access the data. + * + * + * Debugging + * --------- + * + * There are additional state counters that are asserted in various contexts, + * removed from non-debug build to reduce extent_buffer size and for + * performance reasons. + * + * + * Lock nesting + * ------------ + * + * A write operation on a tree might indirectly start a lookup on the same + * tree. This can happen when btrfs_cow_block locks the tree and needs to + * lookup free extents. + * + * btrfs_cow_block + * .. + * alloc_tree_block_no_bg_flush + * btrfs_alloc_tree_block + * btrfs_reserve_extent + * .. + * load_free_space_cache + * .. + * btrfs_lookup_file_extent + * btrfs_search_slot + * + * + * Locking pattern - spinning + * -------------------------- + * + * The simple locking scenario, the +--+ denotes the spinning section. + * + * +- btrfs_tree_lock + * | - extent_buffer::rwlock is held + * | - no heavy operations should happen, eg. IO, memory allocations, large + * | structure traversals + * +- btrfs_tree_unlock + * + * + * Locking pattern - blocking + * -------------------------- + * + * The blocking write uses the following scheme. The +--+ denotes the spinning + * section. + * + * +- btrfs_tree_lock + * | + * +- btrfs_set_lock_blocking_write + * + * - allowed: IO, memory allocations, etc. + * + * -- btrfs_tree_unlock - note, no explicit unblocking necessary + * + * + * Blocking read is similar.
+ * + * +- btrfs_tree_read_lock + * | + * +- btrfs_set_lock_blocking_read + * + * - heavy operations allowed + * + * +- btrfs_tree_read_unlock_blocking + * | + * +- btrfs_tree_read_unlock + * + */ + #ifdef CONFIG_BTRFS_DEBUG -static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) +static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { WARN_ON(eb->spinning_writers); eb->spinning_writers++; } -static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) +static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { WARN_ON(eb->spinning_writers != 1); eb->spinning_writers--; } -static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) +static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { WARN_ON(eb->spinning_writers); } -static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) +static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { atomic_inc(&eb->spinning_readers); } -static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) +static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { WARN_ON(atomic_read(&eb->spinning_readers) == 0); atomic_dec(&eb->spinning_readers); } -static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) +static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { atomic_inc(&eb->read_locks); } -static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) +static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { atomic_dec(&eb->read_locks); } -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { BUG_ON(!atomic_read(&eb->read_locks)); } -static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) +static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { eb->write_locks++; } -static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) +static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { eb->write_locks--; } -void btrfs_assert_tree_locked(struct extent_buffer *eb) -{ - BUG_ON(!eb->write_locks); -} - #else static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { } static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { } @@ -81,11 +180,19 @@ static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { } static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { } static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { } -void btrfs_assert_tree_locked(struct extent_buffer *eb) { } static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { } static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { } #endif +/* + * Mark already held read lock as blocking. Can be nested in write lock by the + * same thread. + * + * Use when there are potentially long operations ahead so other threads waiting + * on the lock will not actively spin but sleep instead. + * + * The rwlock is released and blocking reader counter is increased. + */ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) { trace_btrfs_set_lock_blocking_read(eb); @@ -102,6 +209,14 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) read_unlock(&eb->lock); } +/* + * Mark already held write lock as blocking.
+ * + * Use when there are potentially long operations ahead so other threads + * waiting on the lock will not actively spin but sleep instead. + * + * The rwlock is released and the blocking writers counter is set. + */ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) { trace_btrfs_set_lock_blocking_write(eb); @@ -115,14 +230,19 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); btrfs_assert_tree_locked(eb); - eb->blocking_writers++; + WRITE_ONCE(eb->blocking_writers, 1); write_unlock(&eb->lock); } } /* - * take a spinning read lock. This will wait for any blocking - * writers + * Lock the extent buffer for read. Wait for any writers (spinning or blocking). + * Can be nested in write lock by the same thread. + * + * Use when the locked section performs only lightweight actions and busy waiting + * would be cheaper than making other threads do the wait/wake loop. + * + * The rwlock is held upon exit. */ void btrfs_tree_read_lock(struct extent_buffer *eb) { @@ -134,23 +254,24 @@ again: read_lock(&eb->lock); BUG_ON(eb->blocking_writers == 0 && current->pid == eb->lock_owner); - if (eb->blocking_writers && current->pid == eb->lock_owner) { - /* - * This extent is already write-locked by our thread. We allow - * an additional read lock to be added because it's for the same - * thread. btrfs_find_all_roots() depends on this as it may be - * called on a partly (write-)locked tree. - */ - BUG_ON(eb->lock_nested); - eb->lock_nested = true; - read_unlock(&eb->lock); - trace_btrfs_tree_read_lock(eb, start_ns); - return; - } if (eb->blocking_writers) { + if (current->pid == eb->lock_owner) { + /* + * This extent is already write-locked by our thread. + * We allow an additional read lock to be added because + * it's for the same thread. btrfs_find_all_roots() + * depends on this as it may be called on a partly + * (write-)locked tree. + */ + BUG_ON(eb->lock_nested); + eb->lock_nested = true; + read_unlock(&eb->lock); + trace_btrfs_tree_read_lock(eb, start_ns); + return; + } read_unlock(&eb->lock); wait_event(eb->write_lock_wq, - eb->blocking_writers == 0); + READ_ONCE(eb->blocking_writers) == 0); goto again; } btrfs_assert_tree_read_locks_get(eb); @@ -159,17 +280,19 @@ again: } /* - * take a spinning read lock. - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers + * Lock extent buffer for read, optimistically expecting that there are no + * contending blocking writers. If there are, don't wait. + * + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) { - if (eb->blocking_writers) + if (READ_ONCE(eb->blocking_writers)) return 0; read_lock(&eb->lock); - if (eb->blocking_writers) { + /* Refetch value after lock */ + if (READ_ONCE(eb->blocking_writers)) { read_unlock(&eb->lock); return 0; } @@ -180,18 +303,20 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) } /* - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers + * Try-lock for read. Don't block or wait for contending writers.
+ * + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - if (eb->blocking_writers) + if (READ_ONCE(eb->blocking_writers)) return 0; if (!read_trylock(&eb->lock)) return 0; - if (eb->blocking_writers) { + /* Refetch value after lock */ + if (READ_ONCE(eb->blocking_writers)) { read_unlock(&eb->lock); return 0; } @@ -202,16 +327,19 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) } /* - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers or readers + * Try-lock for write. May block until the lock is uncontended, but does not + * wait until it is free. + * + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) + if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) return 0; write_lock(&eb->lock); - if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) { + /* Refetch value after lock */ + if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); return 0; } @@ -223,7 +351,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) } /* - * drop a spinning read lock + * Release read lock. Must be used only if the lock is in spinning mode. If + * the read lock is nested, must pair with read lock before the write unlock. + * + * The rwlock is not held upon exit. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { @@ -245,7 +376,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) } /* - * drop a blocking read lock + * Release read lock, previously set to blocking by a pairing call to + * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same + * thread. + * + * State of rwlock is unchanged, last reader wakes waiting threads. */ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) { @@ -269,8 +404,10 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } /* - * take a spinning write lock. This will wait for both - * blocking readers or writers + * Lock for write. Wait for all blocking and spinning readers and writers. This + * starts context where reader lock could be nested by the same thread. + * + * The rwlock is held for write upon exit. */ void btrfs_tree_lock(struct extent_buffer *eb) { @@ -282,9 +419,11 @@ void btrfs_tree_lock(struct extent_buffer *eb) WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, eb->blocking_writers == 0); + wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0); write_lock(&eb->lock); - if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) { + /* Refetch value after lock */ + if (atomic_read(&eb->blocking_readers) || + READ_ONCE(eb->blocking_writers)) { write_unlock(&eb->lock); goto again; } @@ -295,10 +434,19 @@ again: } /* - * drop a spinning or a blocking write lock. + * Release the write lock, either blocking or spinning (i.e. there's no need + * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). + * This also ends the context for nesting, the read lock must have been + * released already. + * + * Tasks blocked and waiting are woken, rwlock is not held upon exit.
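
The try-lock variants above are commonly paired with the waiting variants as a fallback. A short sketch (illustrative only, assuming the API in this file):

	static void example_opportunistic_write(struct extent_buffer *eb)
	{
		if (!btrfs_try_tree_write_lock(eb))
			btrfs_tree_lock(eb);	/* contended: fall back and wait */

		/* ... lightweight modifications of the extent buffer ... */

		btrfs_tree_unlock(eb);
	}
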
*/ void btrfs_tree_unlock(struct extent_buffer *eb) { + /* + * This is read both locked and unlocked but always by the same thread + * that already owns the lock so we don't need to use READ_ONCE + */ int blockers = eb->blocking_writers; BUG_ON(blockers > 1); @@ -310,7 +458,8 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { btrfs_assert_no_spinning_writers(eb); - eb->blocking_writers--; + /* Unlocked write */ + WRITE_ONCE(eb->blocking_writers, 0); /* * We need to order modifying blocking_writers above with * actually waking up the sleepers to ensure they see the @@ -322,3 +471,55 @@ void btrfs_tree_unlock(struct extent_buffer *eb) write_unlock(&eb->lock); } } + +/* + * Set all locked nodes in the path to blocking locks. This should be done + * before scheduling + */ +void btrfs_set_path_blocking(struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (!p->nodes[i] || !p->locks[i]) + continue; + /* + * If we currently have a spinning reader or writer lock this + * will bump the count of blocking holders and drop the + * spinlock. + */ + if (p->locks[i] == BTRFS_READ_LOCK) { + btrfs_set_lock_blocking_read(p->nodes[i]); + p->locks[i] = BTRFS_READ_LOCK_BLOCKING; + } else if (p->locks[i] == BTRFS_WRITE_LOCK) { + btrfs_set_lock_blocking_write(p->nodes[i]); + p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; + } + } +} + +/* + * This releases any locks held in the path starting at level and going all the + * way up to the root. + * + * btrfs_search_slot will keep the lock held on higher nodes in a few corner + * cases, such as COW of the block at slot zero in the node. This ignores + * those rules, and it should only be called when there are no more updates to + * be done higher up in the tree. + */ +void btrfs_unlock_up_safe(struct btrfs_path *path, int level) +{ + int i; + + if (path->keep_locks) + return; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + continue; + if (!path->locks[i]) + continue; + btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + path->locks[i] = 0; + } +} diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index b775a4207ed9..21a285883e89 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -6,6 +6,8 @@ #ifndef BTRFS_LOCKING_H #define BTRFS_LOCKING_H +#include "extent_io.h" + #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 #define BTRFS_WRITE_LOCK_BLOCKING 3 @@ -19,11 +21,20 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); void btrfs_set_lock_blocking_read(struct extent_buffer *eb); void btrfs_set_lock_blocking_write(struct extent_buffer *eb); -void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +#ifdef CONFIG_BTRFS_DEBUG +static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { + BUG_ON(!eb->write_locks); +} +#else +static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } +#endif + +void btrfs_set_path_blocking(struct btrfs_path *p); +void btrfs_unlock_up_safe(struct btrfs_path *path, int level); static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index acad4174f68d..aa9cd11f4b78 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -63,27 +63,7 @@ struct workspace { static struct workspace_manager wsm; -static void lzo_init_workspace_manager(void) -{ 
- btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress); -} - -static void lzo_cleanup_workspace_manager(void) -{ - btrfs_cleanup_workspace_manager(&wsm); -} - -static struct list_head *lzo_get_workspace(unsigned int level) -{ - return btrfs_get_workspace(&wsm, level); -} - -static void lzo_put_workspace(struct list_head *ws) -{ - btrfs_put_workspace(&wsm, ws); -} - -static void lzo_free_workspace(struct list_head *ws) +void lzo_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -93,7 +73,7 @@ static void lzo_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *lzo_alloc_workspace(unsigned int level) +struct list_head *lzo_alloc_workspace(unsigned int level) { struct workspace *workspace; @@ -131,13 +111,9 @@ static inline size_t read_compress_length(const char *buf) return le32_to_cpu(dlen); } -static int lzo_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; @@ -303,7 +279,7 @@ out: return ret; } -static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -444,10 +420,9 @@ done: return ret; } -static int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; @@ -508,15 +483,7 @@ out: } const struct btrfs_compress_op btrfs_lzo_compress = { - .init_workspace_manager = lzo_init_workspace_manager, - .cleanup_workspace_manager = lzo_cleanup_workspace_manager, - .get_workspace = lzo_get_workspace, - .put_workspace = lzo_put_workspace, - .alloc_workspace = lzo_alloc_workspace, - .free_workspace = lzo_free_workspace, - .compress_pages = lzo_compress_pages, - .decompress_bio = lzo_decompress_bio, - .decompress = lzo_decompress, + .workspace_manager = &wsm, .max_level = 1, .default_level = 1, }; diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 7d564924dfeb..72bab64ecf60 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -47,4 +47,15 @@ static inline u64 div_factor_fine(u64 num, int factor) return div_u64(num, 100); } +/* Copy of is_power_of_two that is 64bit safe */ +static inline bool is_power_of_two_u64(u64 n) +{ + return n != 0 && (n & (n - 1)) == 0; +} + +static inline bool has_single_bit_set(u64 n) +{ + return is_power_of_two_u64(n); +} + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 24b6c72b9a59..fb09bc2f8e4d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -547,7 +547,6 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, - btrfs_flush_delalloc_helper, btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); 
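
This hunk, like several below, drops the helper argument from btrfs_init_work(). The old and new call shapes, using the names from the hunk above (sketch):

	/* before: a per-queue tracing helper had to be named explicitly */
	btrfs_init_work(&ordered->flush_work, btrfs_flush_delalloc_helper,
			btrfs_run_ordered_extent_work, NULL, NULL);

	/* after: only the work function and the two ordered callbacks remain */
	btrfs_init_work(&ordered->flush_work,
			btrfs_run_ordered_extent_work, NULL, NULL);
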
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); @@ -573,12 +572,11 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, return count; } -u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; - u64 total_done = 0; u64 done; INIT_LIST_HEAD(&splice); @@ -598,7 +596,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); btrfs_put_fs_root(root); - total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != U64_MAX) { @@ -608,8 +605,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); - - return total_done; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 5204171ea962..4eb0319a86d7 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -186,7 +186,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); -u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, struct btrfs_inode *inode, u64 start, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 9cb50577d982..873b6b694107 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -266,9 +266,9 @@ void btrfs_print_leaf(struct extent_buffer *l) struct btrfs_block_group_item); pr_info( "\t\tblock group used %llu chunk_objectid %llu flags %llu\n", - btrfs_disk_block_group_used(l, bi), - btrfs_disk_block_group_chunk_objectid(l, bi), - btrfs_disk_block_group_flags(l, bi)); + btrfs_block_group_used(l, bi), + btrfs_block_group_chunk_objectid(l, bi), + btrfs_block_group_flags(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: print_chunk(l, btrfs_item_ptr(l, i, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1e664e0b59b8..deb59e7cfcac 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -416,11 +416,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - parent_inode = btrfs_iget(sb, &key, parent_root, NULL); + parent_inode = btrfs_iget(sb, &key, parent_root); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(sb, &key, root, NULL); + child_inode = btrfs_iget(sb, &key, root); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); @@ -437,8 +437,6 @@ void __init btrfs_props_init(void) { int i; - hash_init(prop_handlers_ht); - for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { struct prop_handler *p = &prop_handlers[i]; u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c4bb69941c77..93aeb2e539a4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1811,7 +1811,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); /* For src_path */ - extent_buffer_get(src_eb); + atomic_inc(&src_eb->refs); src_path->nodes[root_level] = 
src_eb; src_path->slots[root_level] = dst_path->slots[root_level]; src_path->locks[root_level] = 0; @@ -2067,7 +2067,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, goto out; } /* For dst_path */ - extent_buffer_get(dst_eb); + atomic_inc(&dst_eb->refs); dst_path->nodes[level] = dst_eb; dst_path->slots[level] = 0; dst_path->locks[level] = 0; @@ -2126,7 +2126,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, * walk back up the tree (adjusting slot pointers as we go) * and restart the search process. */ - extent_buffer_get(root_eb); /* For path */ + atomic_inc(&root_eb->refs); /* For path */ path->nodes[root_level] = root_eb; path->slots[root_level] = 0; path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ @@ -3277,10 +3277,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - memset(&fs_info->qgroup_rescan_work, 0, - sizeof(fs_info->qgroup_rescan_work)); btrfs_init_work(&fs_info->qgroup_rescan_work, - btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); return 0; } @@ -3629,7 +3626,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, (s64)num_bytes); + trace_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -3676,7 +3673,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); + trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, num_bytes, type); } @@ -3826,7 +3823,7 @@ out: */ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *subvol_root, - struct btrfs_block_group_cache *bg, + struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, u64 last_snapshot) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 46ba7bd2961c..236f12224d52 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -408,7 +408,7 @@ void btrfs_qgroup_init_swapped_blocks( void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *subvol_root, - struct btrfs_block_group_cache *bg, + struct btrfs_block_group *bg, struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *reloc_parent, int reloc_slot, u64 last_snapshot); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 57a2ac721985..a8e53c8e7b01 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -190,7 +190,7 @@ static void scrub_parity_work(struct btrfs_work *work); static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) { - btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL); + btrfs_init_work(&rbio->work, work_func, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); } @@ -671,8 +671,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) */ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) { - int bucket = rbio_bucket(rbio); - struct btrfs_stripe_hash *h = 
rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; unsigned long flags; @@ -680,64 +679,63 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; + h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { - spin_lock(&cur->bio_list_lock); - - /* can we steal this cached rbio's pages? */ - if (bio_list_empty(&cur->bio_list) && - list_empty(&cur->plug_list) && - test_bit(RBIO_CACHE_BIT, &cur->flags) && - !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { - list_del_init(&cur->hash_list); - refcount_dec(&cur->refs); - - steal_rbio(cur, rbio); - cache_drop = cur; - spin_unlock(&cur->bio_list_lock); + if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) + continue; - goto lockit; - } + spin_lock(&cur->bio_list_lock); - /* can we merge into the lock owner? */ - if (rbio_can_merge(cur, rbio)) { - merge_rbio(cur, rbio); - spin_unlock(&cur->bio_list_lock); - freeit = rbio; - ret = 1; - goto out; - } + /* Can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + refcount_dec(&cur->refs); + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); - /* - * we couldn't merge with the running - * rbio, see if we can merge with the - * pending ones. We don't have to - * check for rmw_locked because there - * is no way they are inside finish_rmw - * right now - */ - list_for_each_entry(pending, &cur->plug_list, - plug_list) { - if (rbio_can_merge(pending, rbio)) { - merge_rbio(pending, rbio); - spin_unlock(&cur->bio_list_lock); - freeit = rbio; - ret = 1; - goto out; - } - } + goto lockit; + } - /* no merging, put us on the tail of the plug list, - * our rbio will be started with the currently - * running rbio unlocks - */ - list_add_tail(&rbio->plug_list, &cur->plug_list); + /* Can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); spin_unlock(&cur->bio_list_lock); + freeit = rbio; ret = 1; goto out; } + + + /* + * We couldn't merge with the running rbio, see if we can merge + * with the pending ones. 
We don't have to check for rmw_locked + * because there is no way they are inside finish_rmw right now + */ + list_for_each_entry(pending, &cur->plug_list, plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* + * No merging, put us on the tail of the plug list, our rbio + * will be started when the currently running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; } lockit: refcount_inc(&rbio->refs); @@ -1743,8 +1741,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, btrfs_rmw_helper, - unplug_work, NULL, NULL); + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); btrfs_queue_work(plug->info->rmw_workers, &plug->work); return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index ee6f60547a8d..243a2e44526e 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -227,7 +227,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, struct btrfs_fs_info *fs_info = dev->fs_info; int ret; struct reada_zone *zone; - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; u64 start; u64 end; int i; @@ -248,8 +248,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, if (!cache) return NULL; - start = cache->key.objectid; - end = start + cache->key.offset - 1; + start = cache->start; + end = start + cache->length - 1; btrfs_put_block_group(cache); zone = kzalloc(sizeof(*zone), GFP_KERNEL); @@ -752,21 +752,19 @@ static int reada_start_machine_dev(struct btrfs_device *dev) static void reada_start_machine_worker(struct btrfs_work *work) { struct reada_machine_work *rmw; - struct btrfs_fs_info *fs_info; int old_ioprio; rmw = container_of(work, struct reada_machine_work, work); - fs_info = rmw->fs_info; - - kfree(rmw); old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), task_nice_ioprio(current)); set_task_ioprio(current, BTRFS_IOPRIO_READA); - __reada_start_machine(fs_info); + __reada_start_machine(rmw->fs_info); set_task_ioprio(current, old_ioprio); - atomic_dec(&fs_info->reada_works_cnt); + atomic_dec(&rmw->fs_info->reada_works_cnt); + + kfree(rmw); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -821,8 +819,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - btrfs_init_work(&rmw->work, btrfs_readahead_helper, - reada_start_machine_worker, NULL, NULL); + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 00504657b602..d897a8e5e430 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -147,7 +147,7 @@ struct file_extent_cluster { struct reloc_control { /* block group to relocate */ - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; /* extent tree */ struct btrfs_root *extent_root; /* inode for moving data */ @@ -1560,11 +1560,10 @@ again: return NULL; } -static int in_block_group(u64 bytenr, - struct btrfs_block_group_cache *block_group) +static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group) { - if (bytenr >= block_group->key.objectid && -
bytenr < block_group->key.objectid + block_group->key.offset) + if (bytenr >= block_group->start && + bytenr < block_group->start + block_group->length) return 1; return 0; } @@ -2246,7 +2245,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_root_level(root_item); - extent_buffer_get(reloc_root->node); + atomic_inc(&reloc_root->node->refs); path->nodes[level] = reloc_root->node; path->slots[level] = 0; } else { @@ -3195,7 +3194,6 @@ static noinline_for_stack int setup_extent_mapping(struct inode *inode, u64 start, u64 end, u64 block_start) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; int ret = 0; @@ -3208,7 +3206,6 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, em->len = end + 1 - start; em->block_len = em->len; em->block_start = block_start; - em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); lock_extent(&BTRFS_I(inode)->io_tree, start, end); @@ -3277,6 +3274,8 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!page) { btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -ENOMEM; goto out; } @@ -3297,7 +3296,7 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); ret = -EIO; goto out; } @@ -3326,7 +3325,7 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, @@ -3342,8 +3341,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } @@ -3543,7 +3541,7 @@ static int block_use_full_backref(struct reloc_control *rc, } static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, + struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { @@ -3559,7 +3557,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) return -ENOENT; @@ -3862,7 +3860,7 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, u64 start, end, last; int ret; - last = rc->block_group->key.objectid + rc->block_group->key.offset; + last = rc->block_group->start + rc->block_group->length; while (1) { cond_resched(); if (rc->search_start >= last) { @@ -3979,7 +3977,7 @@ int prepare_to_relocate(struct reloc_control *rc) return -ENOMEM; memset(&rc->cluster, 0, sizeof(rc->cluster)); - rc->search_start = rc->block_group->key.objectid; + rc->search_start = rc->block_group->start; rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; @@ -4218,7 +4216,7 @@ out: */ static noinline_for_stack struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache 
*group) + struct btrfs_block_group *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -4245,9 +4243,9 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); BUG_ON(IS_ERR(inode)); - BTRFS_I(inode)->index_cnt = group->key.objectid; + BTRFS_I(inode)->index_cnt = group->start; err = btrfs_orphan_add(trans, BTRFS_I(inode)); out: @@ -4282,7 +4280,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) * Print the block group being relocated */ static void describe_relocation(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { char buf[128] = {'\0'}; @@ -4290,7 +4288,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf); + block_group->start, buf); } /* @@ -4298,7 +4296,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; struct btrfs_root *extent_root = fs_info->extent_root; struct reloc_control *rc; struct inode *inode; @@ -4325,7 +4323,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->extent_root = extent_root; rc->block_group = bg; - ret = btrfs_inc_block_group_ro(rc->block_group); + ret = btrfs_inc_block_group_ro(rc->block_group, true); if (ret) { err = ret; goto out; @@ -4363,8 +4361,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); btrfs_wait_ordered_roots(fs_info, U64_MAX, - rc->block_group->key.objectid, - rc->block_group->key.offset); + rc->block_group->start, + rc->block_group->length); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4404,7 +4402,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); + WARN_ON(rc->block_group->used > 0); out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); @@ -4687,7 +4685,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, node->new_bytenr != buf->start); drop_node_buffer(node); - extent_buffer_get(cow); + atomic_inc(&cow->refs); node->eb = cow; node->new_bytenr = cow->start; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f7d4e03f4c5d..21de630b0730 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -389,8 +389,7 @@ static struct full_stripe_lock *search_full_stripe_lock( * * Caller must ensure @cache is a RAID56 block group. */ -static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, - u64 bytenr) +static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr) { u64 ret; @@ -404,8 +403,8 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, * round_down() can only handle power of 2, while RAID56 full * stripe length can be 64KiB * n, so we need to manually round down. 
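
Since round_down() only handles power-of-two alignments, the replacement computes the stripe boundary by 64-bit division. A standalone sketch of the same arithmetic, with a worked example (illustrative, plain C):

	#include <stdint.h>

	/*
	 * Round bytenr down to the start of its full stripe inside a block
	 * group beginning at 'start'. full_stripe_len may be 64KiB * n,
	 * i.e. not a power of two.
	 */
	static uint64_t full_stripe_start(uint64_t bytenr, uint64_t start,
					  uint64_t full_stripe_len)
	{
		return (bytenr - start) / full_stripe_len * full_stripe_len + start;
	}

	/*
	 * E.g. start = 1 MiB, full_stripe_len = 192 KiB (64 KiB * 3 data
	 * stripes): bytenr = start + 500 KiB rounds down to start + 384 KiB.
	 */
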
*/ - ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * - cache->full_stripe_len + cache->key.objectid; + ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) * + cache->full_stripe_len + cache->start; return ret; } @@ -423,7 +422,7 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, bool *locked_ret) { - struct btrfs_block_group_cache *bg_cache; + struct btrfs_block_group *bg_cache; struct btrfs_full_stripe_locks_tree *locks_root; struct full_stripe_lock *existing; u64 fstripe_start; @@ -470,7 +469,7 @@ out: static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, bool locked) { - struct btrfs_block_group_cache *bg_cache; + struct btrfs_block_group *bg_cache; struct btrfs_full_stripe_locks_tree *locks_root; struct full_stripe_lock *fstripe_lock; u64 fstripe_start; @@ -598,8 +597,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - btrfs_init_work(&sbio->work, btrfs_scrub_helper, - scrub_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, + NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -1720,8 +1719,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, - scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } @@ -2149,14 +2147,13 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) scrub_write_block_to_dev_replace(sblock); } - scrub_block_put(sblock); - if (sctx->is_dev_replace && sctx->flush_all_writes) { mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); } + scrub_block_put(sblock); scrub_pending_bio_dec(sctx); } @@ -2204,8 +2201,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) raid56_add_scrub_pages(rbio, spage->page, spage->logical); } - btrfs_init_work(&sblock->work, btrfs_scrub_helper, - scrub_missing_raid56_worker, NULL, NULL); + btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); @@ -2743,8 +2739,8 @@ static void scrub_parity_bio_endio(struct bio *bio) bio_put(bio); - btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, - scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, + NULL); btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); } @@ -3420,7 +3416,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 chunk_offset, u64 length, u64 dev_offset, - struct btrfs_block_group_cache *cache) + struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; @@ -3484,7 +3480,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct extent_buffer *l; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; path = btrfs_alloc_path(); @@ -3563,46 +3559,26 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * -> btrfs_scrub_pause() */ scrub_pause_on(fs_info); - ret = 
btrfs_inc_block_group_ro(cache); - if (!ret && sctx->is_dev_replace) { - /* - * If we are doing a device replace wait for any tasks - * that started delalloc right before we set the block - * group to RO mode, as they might have just allocated - * an extent from it or decided they could do a nocow - * write. And if any such tasks did that, wait for their - * ordered extents to complete and then commit the - * current transaction, so that we can later see the new - * extent items in the extent tree - the ordered extents - * create delayed data references (for cow writes) when - * they complete, which will be run and insert the - * corresponding extent items into the extent tree when - * we commit the transaction they used when running - * inode.c:btrfs_finish_ordered_io(). We later use - * the commit root of the extent tree to find extents - * to copy from the srcdev into the tgtdev, and we don't - * want to miss any new extents. - */ - btrfs_wait_block_group_reservations(cache); - btrfs_wait_nocow_writers(cache); - ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, - cache->key.objectid, - cache->key.offset); - if (ret > 0) { - struct btrfs_trans_handle *trans; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - ret = PTR_ERR(trans); - else - ret = btrfs_commit_transaction(trans); - if (ret) { - scrub_pause_off(fs_info); - btrfs_put_block_group(cache); - break; - } - } - } + + /* + * Don't do chunk preallocation for scrub. + * + * This is especially important for SYSTEM bgs, or we can hit + * -EFBIG from btrfs_finish_chunk_alloc() like: + * 1. The only SYSTEM bg is marked RO. + * Since SYSTEM bg is small, that's pretty common. + * 2. A new SYSTEM bg will be allocated, + * because the regular version of btrfs_inc_block_group_ro() + * would allocate a new chunk. + * 3. The new SYSTEM bg is empty and will get cleaned up. + * Before cleanup really happens, it's marked RO again. + * 4. The empty SYSTEM bg gets scrubbed. + * We go back to 2. + * + * This can easily boost the amount of SYSTEM chunks if the cleaner + * thread can't be triggered fast enough, and use up all the space + * of btrfs_super_block::sys_chunk_array. + */ + ret = btrfs_inc_block_group_ro(cache, false); scrub_pause_off(fs_info); if (ret == 0) { @@ -3623,7 +3599,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } - down_write(&fs_info->dev_replace.rwsem); + down_write(&dev_replace->rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; @@ -3664,10 +3640,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - down_write(&fs_info->dev_replace.rwsem); + down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - up_write(&fs_info->dev_replace.rwsem); + up_write(&dev_replace->rwsem); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -3681,7 +3657,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ spin_lock(&cache->lock); if (!cache->removed && !cache->ro && cache->reserved == 0 && - btrfs_block_group_used(&cache->item) == 0) { + cache->used == 0) { spin_unlock(&cache->lock); btrfs_mark_bg_unused(cache); } else { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 123ac54af071..ae2db5eb1549 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -25,6 +25,14 @@ #include "compression.h" /* + * Maximum number of references an extent can have in order for us to attempt to + * issue clone operations instead of write operations.
This currently exists to + * avoid hitting limitations of the backreference walking code (taking a lot of + * time and using too much memory for extents with large number of references). + */ +#define SEND_MAX_EXTENT_REFS 64 + +/* * A fs_path is a helper to dynamically build path names with unknown size. * It reallocates the internal buffer on demand. * It allows fast adding of path elements on the right side (normal path) and @@ -1248,12 +1256,20 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (found->root == bctx->sctx->send_root) { /* - * TODO for the moment we don't accept clones from the inode - * that is currently send. We may change this when - * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same - * file. + * If the source inode was not yet processed we can't issue a + * clone operation, as the source extent does not exist yet at + * the destination of the stream. */ - if (ino >= bctx->cur_objectid) + if (ino > bctx->cur_objectid) + return 0; + /* + * We clone from the inode currently being sent as long as the + * source extent is already processed, otherwise we could try + * to clone from an extent that does not exist yet at the + * destination of the stream. + */ + if (ino == bctx->cur_objectid && + offset >= bctx->sctx->cur_inode_next_write_offset) return 0; } @@ -1302,6 +1318,7 @@ static int find_extent_clone(struct send_ctx *sctx, struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; + struct btrfs_extent_item *ei; int compressed; u32 i; @@ -1349,7 +1366,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = extent_from_logical(fs_info, disk_byte, tmp_path, &found_key, &flags); up_read(&fs_info->commit_root_sem); - btrfs_release_path(tmp_path); if (ret < 0) goto out; @@ -1358,6 +1374,21 @@ static int find_extent_clone(struct send_ctx *sctx, goto out; } + ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], + struct btrfs_extent_item); + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) { + ret = -ENOENT; + goto out; + } + btrfs_release_path(tmp_path); + /* * Setup the clone roots. 
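
The two checks added to __iterate_backrefs() above can be read as a single predicate. A condensed sketch (hypothetical helper, not in the patch):

	/*
	 * May the extent at (ino, offset) in the send root serve as a clone
	 * source while inode 'cur_ino' is being sent?
	 */
	static bool clone_source_ok(u64 ino, u64 offset, u64 cur_ino,
				    u64 cur_inode_next_write_offset)
	{
		if (ino > cur_ino)
			return false;	/* source inode not yet created at the receiver */
		if (ino == cur_ino && offset >= cur_inode_next_write_offset)
			return false;	/* source range not yet written at the receiver */
		return true;
	}
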
*/ @@ -4779,7 +4810,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 98dc092a905e..f09aa6ee9113 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,7 +10,7 @@ #include "transaction.h" #include "block-group.h" -u64 btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -58,7 +58,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) spin_lock_init(&space_info->lock); space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; - init_waitqueue_head(&space_info->wait); INIT_LIST_HEAD(&space_info->ro_bgs); INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); @@ -285,7 +284,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; int index = 0; spin_lock(&info->lock); @@ -301,8 +300,7 @@ again: spin_lock(&cache->lock); btrfs_info(fs_info, "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", - cache->key.objectid, cache->key.offset, - btrfs_block_group_used(&cache->item), cache->pinned, + cache->start, cache->length, cache->used, cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); @@ -893,6 +891,15 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, while (ticket->bytes > 0 && ticket->error == 0) { ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); if (ret) { + /* + * Delete us from the list. After we unlock the space + * info, we don't want the async reclaim job to reserve + * space for this ticket. If that would happen, then the + * ticket's task would not know that space was reserved + * despite getting an error, resulting in a space leak + * (bytes_may_use counter of our space_info). + */ + list_del_init(&ticket->list); ticket->error = -EINTR; break; } @@ -945,12 +952,24 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); ret = ticket->error; if (ticket->bytes || ticket->error) { + /* + * Need to delete here for priority tickets. For regular tickets + * either the async reclaim job deletes the ticket from the list + * or we delete it ourselves at wait_reserve_ticket(). + */ list_del_init(&ticket->list); if (!ret) ret = -ENOSPC; } spin_unlock(&space_info->lock); ASSERT(list_empty(&ticket->list)); + /* + * Check that we can't have an error set if the reservation succeeded, + * as that would confuse tasks and lead them to error out without + * releasing reserved space (if an error happens the expectation is that + * space wasn't reserved at all).
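
Taken together, the assertions added here encode one postcondition for handle_reserve_ticket(): success and error are mutually exclusive, and the ticket is always unlinked on return. As a sketch (illustrative; ASSERT as used in this file):

	static void ticket_postcondition(const struct reserve_ticket *ticket)
	{
		/* never both: space fully granted and an error reported */
		ASSERT(!(ticket->bytes == 0 && ticket->error));
		/* the ticket must be off the space_info lists either way */
		ASSERT(list_empty(&ticket->list));
	}
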
+ */ + ASSERT(!(ticket->bytes == 0 && ticket->error)); return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 8867e84aa33d..1a349e3f9cc1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -63,7 +63,6 @@ struct btrfs_space_info { struct rw_semaphore groups_sem; /* for block groups in our same type */ struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - wait_queue_head_t wait; struct kobject kobj; struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; @@ -116,7 +115,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info **space_info); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1b151af25772..f452a94abdc3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,7 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -const char *btrfs_decode_error(int errno) +const char * __attribute_const__ btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -187,7 +187,7 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; @@ -1219,7 +1219,7 @@ static int btrfs_fill_super(struct super_block *sb, key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); + inode = btrfs_iget(sb, &key, fs_info->fs_root); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; @@ -1669,7 +1669,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); @@ -1936,6 +1935,10 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, num_stripes = nr_devices; else if (type & BTRFS_BLOCK_GROUP_RAID1) num_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1C3) + num_stripes = 3; + else if (type & BTRFS_BLOCK_GROUP_RAID1C4) + num_stripes = 4; else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; @@ -2022,7 +2025,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_super_block *disk_super = fs_info->super_copy; - struct list_head *head = &fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; @@ -2036,7 +2038,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) int mixed = 0; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry_rcu(found, &fs_info->space_info, list) { if (found->flags & 
BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2297,7 +2299,7 @@ static const struct super_operations btrfs_super_ops = { static const struct file_operations btrfs_ctl_fops = { .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, - .compat_ioctl = btrfs_control_ioctl, + .compat_ioctl = compat_ptr_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; @@ -2360,10 +2362,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_cachep; - err = extent_map_init(); + err = extent_state_cache_init(); if (err) goto free_extent_io; + err = extent_map_init(); + if (err) + goto free_extent_state_cache; + err = ordered_data_init(); if (err) goto free_extent_map; @@ -2422,6 +2428,8 @@ free_ordered_data: ordered_data_exit(); free_extent_map: extent_map_exit(); +free_extent_state_cache: + extent_state_cache_exit(); free_extent_io: extent_io_exit(); free_cachep: @@ -2442,6 +2450,7 @@ static void __exit exit_btrfs_fs(void) btrfs_prelim_ref_exit(); ordered_data_exit(); extent_map_exit(); + extent_state_cache_exit(); extent_io_exit(); btrfs_interface_exit(); btrfs_end_io_wq_exit(); @@ -2456,3 +2465,6 @@ module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); +MODULE_SOFTDEP("pre: xxhash64"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: blake2b-256"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f6d3c80f2e28..5ebbe8a5ee76 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -9,6 +9,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/bug.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" @@ -258,6 +259,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); +BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -272,6 +274,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(no_holes), BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), + BTRFS_FEAT_ATTR_PTR(raid1c34), NULL }; @@ -295,8 +298,30 @@ static ssize_t rmdir_subvol_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); +static ssize_t supported_checksums_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + ssize_t ret = 0; + int i; + + for (i = 0; i < btrfs_get_num_csums(); i++) { + /* + * This "trick" only works as long as 'enum btrfs_csum_type' has + * no holes in it + */ + ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + (i == 0 ? 
"" : " "), btrfs_super_csum_name(i)); + + } + + ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), + BTRFS_ATTR_PTR(static_feature, supported_checksums), NULL }; @@ -372,16 +397,16 @@ static ssize_t raid_bytes_show(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); - struct btrfs_block_group_cache *block_group; + struct btrfs_block_group *block_group; int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags); u64 val = 0; down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) - val += block_group->key.offset; + val += block_group->length; else - val += btrfs_block_group_used(&block_group->item); + val += block_group->used; } up_read(&sinfo->groups_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", val); @@ -604,6 +629,19 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); +static ssize_t btrfs_checksum_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); + + return snprintf(buf, PAGE_SIZE, "%s (%s)\n", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(fs_info->csum_shash)); +} + +BTRFS_ATTR(, checksum, btrfs_checksum_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -611,6 +649,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), + BTRFS_ATTR_PTR(, checksum), NULL, }; @@ -822,7 +861,7 @@ static void init_feature_attrs(void) * Create a sysfs entry for a given block group type at path * /sys/fs/btrfs/UUID/allocation/data/TYPE */ -void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache) +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_space_info *space_info = cache->space_info; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 610e9c36a94c..e10c3adfc30f 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -32,7 +32,7 @@ int __init btrfs_init_sysfs(void); void __cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache); +void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 99fe9bf3fdac..a7aca4141788 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -202,11 +202,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root) kfree(root); } -struct btrfs_block_group_cache * +struct btrfs_block_group * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = kzalloc(sizeof(*cache), GFP_KERNEL); 
if (!cache) @@ -218,9 +218,8 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, return NULL; } - cache->key.objectid = 0; - cache->key.offset = length; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->start = 0; + cache->length = length; cache->full_stripe_len = fs_info->sectorsize; cache->fs_info = fs_info; @@ -233,7 +232,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, return cache; } -void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) +void btrfs_free_dummy_block_group(struct btrfs_block_group *cache) { if (!cache) return; diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index ee277bbd939b..9e52527357d8 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -41,9 +41,9 @@ struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); -struct btrfs_block_group_cache * +struct btrfs_block_group * btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); -void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); +void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); #else diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 43ec7060fcd2..aebdf23f0cdd 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -17,7 +17,7 @@ * entry and remove space from either end and the middle, and make sure we can * remove space that covers adjacent extent entries. */ -static int test_extents(struct btrfs_block_group_cache *cache) +static int test_extents(struct btrfs_block_group *cache) { int ret = 0; @@ -87,8 +87,7 @@ static int test_extents(struct btrfs_block_group_cache *cache) return 0; } -static int test_bitmaps(struct btrfs_block_group_cache *cache, - u32 sectorsize) +static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize) { u64 next_bitmap_offset; int ret; @@ -156,7 +155,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache, } /* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, +static int test_bitmaps_and_extents(struct btrfs_block_group *cache, u32 sectorsize) { u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); @@ -331,7 +330,7 @@ static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl, /* Used by test_steal_space_from_bitmap_to_extent(). */ static int -check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, +check_num_extents_and_bitmaps(const struct btrfs_block_group *cache, const int num_extents, const int num_bitmaps) { @@ -351,7 +350,7 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, } /* Used by test_steal_space_from_bitmap_to_extent(). */ -static int check_cache_empty(struct btrfs_block_group_cache *cache) +static int check_cache_empty(struct btrfs_block_group *cache) { u64 offset; u64 max_extent_size; @@ -393,7 +392,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * requests. 
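
The test hunks here follow the same mechanical rename as the rest of the series: the block group's embedded key becomes plain members. A sketch of the mapping (assuming the new struct btrfs_block_group):

	/* cache->key.objectid                  -> cache->start  (logical start)
	 * cache->key.offset                    -> cache->length (size in bytes)
	 * btrfs_block_group_used(&cache->item) -> cache->used   */

	/* formerly cache->key.objectid + cache->key.offset */
	static u64 block_group_end(const struct btrfs_block_group *cache)
	{
		return cache->start + cache->length;
	}
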
*/ static int -test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, u32 sectorsize) { int ret; @@ -829,7 +828,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; struct btrfs_root *root = NULL; int ret = -ENOMEM; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index bc92df977630..1a846bf6e197 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -18,7 +18,7 @@ struct free_space_extent { static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, const struct free_space_extent * const extents, unsigned int num_extents) @@ -48,7 +48,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { if (path->slots[0] != 0) goto invalid; - end = cache->key.objectid + cache->key.offset; + end = cache->start + cache->length; i = 0; while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -107,7 +107,7 @@ invalid: static int check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, const struct free_space_extent * const extents, unsigned int num_extents) @@ -150,12 +150,12 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, static int test_empty_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, cache->key.offset}, + {cache->start, cache->length}, }; return check_free_space_extents(trans, fs_info, cache, path, @@ -164,7 +164,7 @@ static int test_empty_block_group(struct btrfs_trans_handle *trans, static int test_remove_all(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { @@ -172,8 +172,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans, int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, + cache->length); if (ret) { test_err("could not remove free space"); return ret; @@ -185,18 +185,17 @@ static int test_remove_all(struct btrfs_trans_handle *trans, static int test_remove_beginning(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid + alignment, - cache->key.offset - alignment}, + {cache->start + alignment, cache->length - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, alignment); + cache->start, alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -209,19 +208,18 @@ static int 
test_remove_beginning(struct btrfs_trans_handle *trans, static int test_remove_end(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, cache->key.offset - alignment}, + {cache->start, cache->length - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid + - cache->key.offset - alignment, - alignment); + cache->start + cache->length - alignment, + alignment); if (ret) { test_err("could not remove free space"); return ret; @@ -233,19 +231,18 @@ static int test_remove_end(struct btrfs_trans_handle *trans, static int test_remove_middle(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, alignment}, - {cache->key.objectid + 2 * alignment, - cache->key.offset - 2 * alignment}, + {cache->start, alignment}, + {cache->start + 2 * alignment, cache->length - 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, + cache->start + alignment, alignment); if (ret) { test_err("could not remove free space"); @@ -258,24 +255,23 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, static int test_merge_left(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, 2 * alignment}, + {cache->start, 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + ret = __add_to_free_space_tree(trans, cache, path, cache->start, alignment); if (ret) { test_err("could not add free space"); @@ -283,7 +279,7 @@ static int test_merge_left(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); @@ -296,25 +292,24 @@ static int test_merge_left(struct btrfs_trans_handle *trans, static int test_merge_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid + alignment, 2 * alignment}, + {cache->start + alignment, 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 2 * alignment, + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); @@ -322,7 +317,7 @@ static int test_merge_right(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, + cache->start + alignment, alignment); if (ret) { 
test_err("could not add free space"); @@ -335,24 +330,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans, static int test_merge_both(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, 3 * alignment}, + {cache->start, 3 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + ret = __add_to_free_space_tree(trans, cache, path, cache->start, alignment); if (ret) { test_err("could not add free space"); @@ -360,16 +354,14 @@ static int test_merge_both(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 2 * alignment, - alignment); + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + alignment, - alignment); + cache->start + alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -381,26 +373,25 @@ static int test_merge_both(struct btrfs_trans_handle *trans, static int test_merge_none(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *cache, + struct btrfs_block_group *cache, struct btrfs_path *path, u32 alignment) { const struct free_space_extent extents[] = { - {cache->key.objectid, alignment}, - {cache->key.objectid + 2 * alignment, alignment}, - {cache->key.objectid + 4 * alignment, alignment}, + {cache->start, alignment}, + {cache->start + 2 * alignment, alignment}, + {cache->start + 4 * alignment, alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, cache, path, - cache->key.objectid, - cache->key.offset); + cache->start, cache->length); if (ret) { test_err("could not remove free space"); return ret; } - ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid, + ret = __add_to_free_space_tree(trans, cache, path, cache->start, alignment); if (ret) { test_err("could not add free space"); @@ -408,16 +399,14 @@ static int test_merge_none(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 4 * alignment, - alignment); + cache->start + 4 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; } ret = __add_to_free_space_tree(trans, cache, path, - cache->key.objectid + 2 * alignment, - alignment); + cache->start + 2 * alignment, alignment); if (ret) { test_err("could not add free space"); return ret; @@ -429,7 +418,7 @@ static int test_merge_none(struct btrfs_trans_handle *trans, typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_fs_info *, - struct btrfs_block_group_cache *, + struct btrfs_block_group *, struct btrfs_path *, u32 alignment); @@ -438,7 +427,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, { struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; - struct btrfs_block_group_cache *cache = NULL; + struct btrfs_block_group *cache = NULL; struct btrfs_trans_handle trans; struct btrfs_path *path = NULL; int ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8624bdee8c5b..cfc08ef9b876 
100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -24,9 +24,79 @@
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
+/*
+ * Transaction states and transitions
+ *
+ * No running transaction (fs tree blocks are not modified)
+ * |
+ * | To next stage:
+ * |  Call start_transaction() variants. Except btrfs_join_transaction_nostart().
+ * V
+ * Transaction N [[TRANS_STATE_RUNNING]]
+ * |
+ * | New trans handles can be attached to transaction N by calling all
+ * | start_transaction() variants.
+ * |
+ * | To next stage:
+ * |  Call btrfs_commit_transaction() on any trans handle attached to
+ * |  transaction N
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * |
+ * | Will wait for previous running transaction to completely finish if there
+ * | is one
+ * |
+ * | Then one of the following happens:
+ * | - Wait for all other trans handle holders to release.
+ * |   The btrfs_commit_transaction() caller will do the commit work.
+ * | - Wait for current transaction to be committed by others.
+ * |   Other btrfs_commit_transaction() caller will do the commit work.
+ * |
+ * | At this stage, only btrfs_join_transaction*() variants can attach
+ * | to this running transaction.
+ * | All other variants will wait for the current one to finish and attach to
+ * | transaction N+1.
+ * |
+ * | To next stage:
+ * |  Caller is chosen to commit transaction N, and all other trans handles
+ * |  have been released.
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_DOING]]
+ * |
+ * | The heavy lifting transaction work is started.
+ * | From running delayed refs (modifying extent tree) to creating pending
+ * | snapshots, running qgroups.
+ * | In short, modify supporting trees to reflect modifications of subvolume
+ * | trees.
+ * |
+ * | At this stage, all start_transaction() calls will wait for this
+ * | transaction to finish and attach to transaction N+1.
+ * |
+ * | To next stage:
+ * |  Until all supporting trees are updated.
+ * V
+ * Transaction N [[TRANS_STATE_UNBLOCKED]]
+ * |                                                Transaction N+1
+ * | All needed trees are modified, thus we only    [[TRANS_STATE_RUNNING]]
+ * | need to write them back to disk and update     |
+ * | super blocks.                                  |
+ * |                                                |
+ * | At this stage, new transaction is allowed to   |
+ * | start.                                         |
+ * | All new start_transaction() calls will be      |
+ * | attached to transid N+1.                       |
+ * |                                                |
+ * | To next stage:                                 |
+ * |  Until all tree blocks and super blocks are    |
+ * |  written to block devices                      |
+ * V                                                |
+ * Transaction N [[TRANS_STATE_COMPLETED]]          V
+ * All tree blocks and super blocks are written.    Transaction N+1
+ * This transaction is finished and all its         [[TRANS_STATE_COMMIT_START]]
+ * data structures will be cleaned up.              | Life goes on
+ */
 static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 	[TRANS_STATE_RUNNING]		= 0U,
-	[TRANS_STATE_BLOCKED]		= __TRANS_START,
 	[TRANS_STATE_COMMIT_START]	= (__TRANS_START | __TRANS_ATTACH),
 	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_START |
 					   __TRANS_ATTACH |
@@ -63,10 +133,10 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 	 * discard the physical locations of the block groups.
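The state diagram above is enforced through a per-state bitmask: btrfs_blocked_trans_types[] records, for every transaction state, which trans handle types must not attach to that transaction. A rough standalone sketch of the gating idea follows; the names and bit values here are simplified stand-ins of our own (the join-nolock/nostart variants are folded into one bit), not the kernel's actual __TRANS_* flags:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's handle-type bits. */
enum trans_type { T_START = 1u << 0, T_ATTACH = 1u << 1, T_JOIN = 1u << 2 };

/* Simplified states, in the order of the diagram above. */
enum trans_state {
	ST_RUNNING, ST_COMMIT_START, ST_COMMIT_DOING,
	ST_UNBLOCKED, ST_COMPLETED, ST_MAX
};

/* Which handle types may not attach to a transaction in each state. */
static const unsigned int blocked_types[ST_MAX] = {
	[ST_RUNNING]      = 0u,                 /* anything may attach */
	[ST_COMMIT_START] = T_START | T_ATTACH, /* join variants still OK */
	[ST_COMMIT_DOING] = T_START | T_ATTACH | T_JOIN,
	[ST_UNBLOCKED]    = T_START | T_ATTACH | T_JOIN, /* go to N+1 */
	[ST_COMPLETED]    = T_START | T_ATTACH | T_JOIN, /* N is done */
};

static bool handle_must_wait(enum trans_state s, enum trans_type t)
{
	return (blocked_types[s] & t) != 0;
}

int main(void)
{
	printf("START during COMMIT_DOING waits: %d\n",
	       handle_must_wait(ST_COMMIT_DOING, T_START));
	printf("JOIN during COMMIT_START waits:  %d\n",
	       handle_must_wait(ST_COMMIT_START, T_JOIN));
	return 0;
}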
*/ while (!list_empty(&transaction->deleted_bgs)) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; cache = list_first_entry(&transaction->deleted_bgs, - struct btrfs_block_group_cache, + struct btrfs_block_group, bg_list); list_del_init(&cache->bg_list); btrfs_put_block_group_trimming(cache); @@ -383,7 +453,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, static inline int is_transaction_blocked(struct btrfs_transaction *trans) { - return (trans->state >= TRANS_STATE_BLOCKED && + return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && !trans->aborted); } @@ -570,7 +640,7 @@ again: INIT_LIST_HEAD(&h->new_bgs); smp_mb(); - if (cur_trans->state >= TRANS_STATE_BLOCKED && + if (cur_trans->state >= TRANS_STATE_COMMIT_START && may_wait_transaction(fs_info, type)) { current->journal_info = h; btrfs_commit_transaction(h); @@ -659,7 +729,7 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) true); } -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) +struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK, BTRFS_RESERVE_NO_FLUSH, true); @@ -798,7 +868,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; smp_mb(); - if (cur_trans->state >= TRANS_STATE_BLOCKED || + if (cur_trans->state >= TRANS_STATE_COMMIT_START || cur_trans->delayed_refs.flushing) return 1; @@ -831,7 +901,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; if (refcount_read(&trans->use_count) > 1) { @@ -847,13 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_chunk_metadata(trans); - if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { - if (throttle) - return btrfs_commit_transaction(trans); - else - wake_up_process(info->transaction_kthread); - } - if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(info->sb); @@ -990,7 +1052,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, return werr; } -int btrfs_wait_extents(struct btrfs_fs_info *fs_info, +static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { bool errors = false; @@ -1875,7 +1937,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_group_cache *block_group, *tmp; + struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_delayed_refs_rsv_release(fs_info, 1); @@ -1949,6 +2011,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *prev_trans = NULL; int ret; + ASSERT(refcount_read(&trans->use_count) == 1); + /* Stop the commit early if ->aborted is set */ if (unlikely(READ_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2c5a6f6e5bb0..49f7196368f5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -13,7 +13,6 @@ enum btrfs_trans_state { TRANS_STATE_RUNNING, - TRANS_STATE_BLOCKED, 
TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, @@ -184,7 +183,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( unsigned int num_items, int min_factor); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction_barrier( @@ -218,8 +217,6 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); -int btrfs_wait_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43e488f5d063..493d4d9e0f79 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -23,6 +23,7 @@ #include "disk-io.h" #include "compression.h" #include "volumes.h" +#include "misc.h" /* * Error message should follow the following format: @@ -124,6 +125,74 @@ static u64 file_extent_end(struct extent_buffer *leaf, return end; } +/* + * Customized report for dir_item, the only new important information is + * key->objectid, which represents inode number + */ +__printf(3, 4) +__cold +static void dir_item_err(const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + const struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); + va_end(args); +} + +/* + * This function checks prev_key->objectid, to ensure the current key and + * prev_key share the same objectid as the inode number. + * + * This is to detect missing INODE_ITEM in subvolume trees. + * + * Return true if everything is OK or we don't need to check. + * Return false if anything is wrong. + */ +static bool check_prev_ino(struct extent_buffer *leaf, + struct btrfs_key *key, int slot, + struct btrfs_key *prev_key) +{ + /* No prev key, skip check */ + if (slot == 0) + return true; + + /* Only these key->types need to be checked */ + ASSERT(key->type == BTRFS_XATTR_ITEM_KEY || + key->type == BTRFS_INODE_REF_KEY || + key->type == BTRFS_DIR_INDEX_KEY || + key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_EXTENT_DATA_KEY); + + /* + * Only subvolume trees along with their reloc trees need this check. + * Things like the log tree don't follow this ino requirement.
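The check works because btrfs keys sort by (objectid, type, offset), every item of an inode carries that inode's number as objectid, and the INODE_ITEM (the lowest of those types) sorts first. An inode-scoped item whose previous leaf slot holds a different objectid therefore implies the INODE_ITEM that should sit between them is missing. A minimal standalone sketch of the adjacency test, using our own simplified key struct:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified btrfs key: all items of one inode share the objectid. */
struct skey {
	uint64_t objectid;
	uint8_t type;
};

/*
 * True when the previous slot is consistent with the current one, i.e.
 * both belong to the same inode (or there is no previous slot to check).
 */
static bool prev_ino_ok(const struct skey *prev, const struct skey *cur,
			int slot)
{
	if (slot == 0)		/* first slot in the leaf: nothing to compare */
		return true;
	return prev->objectid == cur->objectid;
}

int main(void)
{
	struct skey prev = { .objectid = 256, .type = 12 };  /* INODE_REF */
	struct skey cur  = { .objectid = 257, .type = 108 }; /* EXTENT_DATA */

	/* Inode 257's INODE_ITEM should precede its EXTENT_DATA: corruption. */
	printf("consistent: %d\n", prev_ino_ok(&prev, &cur, 1));
	return 0;
}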
+ */ + if (!is_fstree(btrfs_header_owner(leaf))) + return true; + + if (key->objectid == prev_key->objectid) + return true; + + /* Error found */ + dir_item_err(leaf, slot, + "invalid previous key objectid, have %llu expect %llu", + prev_key->objectid, key->objectid); + return false; +} static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot, struct btrfs_key *prev_key) @@ -141,13 +210,33 @@ static int check_extent_data_item(struct extent_buffer *leaf, return -EUCLEAN; } + /* + * Previous key must have the same key->objectid (ino). + * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA. + * But if objectids mismatch, it means we have a missing + * INODE_ITEM. + */ + if (!check_prev_ino(leaf, key, slot, prev_key)) + return -EUCLEAN; + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + /* + * Make sure the item contains at least inline header, so the file + * extent type is not some garbage. + */ + if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { + file_extent_err(leaf, slot, + "invalid item size, have %u expect [%lu, %u)", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, + SZ_4K); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) { file_extent_err(leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), - BTRFS_FILE_EXTENT_TYPES); + BTRFS_NR_FILE_EXTENT_TYPES - 1); return -EUCLEAN; } @@ -155,11 +244,11 @@ static int check_extent_data_item(struct extent_buffer *leaf, * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ - if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) { file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), - BTRFS_COMPRESS_TYPES); + BTRFS_NR_COMPRESS_TYPES - 1); return -EUCLEAN; } if (btrfs_file_extent_encryption(leaf, fi)) { @@ -270,42 +359,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, return 0; } -/* - * Customized reported for dir_item, only important new info is key->objectid, - * which represents inode number - */ -__printf(3, 4) -__cold -static void dir_item_err(const struct extent_buffer *eb, int slot, - const char *fmt, ...) -{ - const struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_key key; - struct va_format vaf; - va_list args; - - btrfs_item_key_to_cpu(eb, &key, slot); - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - btrfs_crit(fs_info, - "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", - btrfs_header_level(eb) == 0 ? 
"leaf" : "node", - btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, - key.objectid, &vaf); - va_end(args); -} - static int check_dir_item(struct extent_buffer *leaf, - struct btrfs_key *key, int slot) + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dir_item *di; u32 item_size = btrfs_item_size_nr(leaf, slot); u32 cur = 0; + if (!check_prev_ino(leaf, key, slot, prev_key)) + return -EUCLEAN; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { u32 name_len; @@ -459,23 +523,23 @@ static int check_block_group_item(struct extent_buffer *leaf, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (btrfs_block_group_chunk_objectid(&bgi) != + if (btrfs_stack_block_group_chunk_objectid(&bgi) != BTRFS_FIRST_CHUNK_TREE_OBJECTID) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", - btrfs_block_group_chunk_objectid(&bgi), + btrfs_stack_block_group_chunk_objectid(&bgi), BTRFS_FIRST_CHUNK_TREE_OBJECTID); return -EUCLEAN; } - if (btrfs_block_group_used(&bgi) > key->offset) { + if (btrfs_stack_block_group_used(&bgi) > key->offset) { block_group_err(leaf, slot, "invalid block group used, have %llu expect [0, %llu)", - btrfs_block_group_used(&bgi), key->offset); + btrfs_stack_block_group_used(&bgi), key->offset); return -EUCLEAN; } - flags = btrfs_block_group_flags(&bgi); + flags = btrfs_stack_block_group_flags(&bgi); if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { block_group_err(leaf, slot, "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", @@ -609,7 +673,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, return -EUCLEAN; } - if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { chunk_err(leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", @@ -686,9 +750,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot, static int check_dev_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { - struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_dev_item *ditem; - u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { dev_item_err(leaf, slot, @@ -696,12 +758,6 @@ static int check_dev_item(struct extent_buffer *leaf, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } - if (key->offset > max_devid) { - dev_item_err(leaf, slot, - "invalid devid: has=%llu expect=[0, %llu]", - key->offset, max_devid); - return -EUCLEAN; - } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (btrfs_device_id(leaf, ditem) != key->offset) { dev_item_err(leaf, slot, @@ -793,11 +849,11 @@ static int check_inode_item(struct extent_buffer *leaf, } /* - * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2, - * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG. - * Only needs to check BLK, LNK and SOCKS + * S_IFMT is not bit mapped so we can't completely rely on + * is_power_of_2/has_single_bit_set, but it can save us from checking + * FIFO/CHR/DIR/REG. 
Only needs to check BLK, LNK and SOCKS */ - if (!is_power_of_2(mode & S_IFMT)) { + if (!has_single_bit_set(mode & S_IFMT)) { if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { inode_item_err(fs_info, leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", @@ -1018,8 +1074,8 @@ static int check_extent_item(struct extent_buffer *leaf, btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } - if (!is_power_of_2(flags & (BTRFS_EXTENT_FLAG_DATA | - BTRFS_EXTENT_FLAG_TREE_BLOCK))) { + if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK))) { extent_err(leaf, slot, "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", flags, BTRFS_EXTENT_FLAG_DATA | @@ -1232,6 +1288,58 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return 0; } +#define inode_ref_err(fs_info, eb, slot, fmt, args...) \ + inode_item_err(fs_info, eb, slot, fmt, ##args) +static int check_inode_ref(struct extent_buffer *leaf, + struct btrfs_key *key, struct btrfs_key *prev_key, + int slot) +{ + struct btrfs_inode_ref *iref; + unsigned long ptr; + unsigned long end; + + if (!check_prev_ino(leaf, key, slot, prev_key)) + return -EUCLEAN; + /* namelen can't be 0, so item_size == sizeof() is also invalid */ + if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) { + inode_ref_err(fs_info, leaf, slot, + "invalid item size, have %u expect (%zu, %u)", + btrfs_item_size_nr(leaf, slot), + sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + ptr = btrfs_item_ptr_offset(leaf, slot); + end = ptr + btrfs_item_size_nr(leaf, slot); + while (ptr < end) { + u16 namelen; + + if (ptr + sizeof(*iref) > end) { + inode_ref_err(fs_info, leaf, slot, + "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", + ptr, end, sizeof(*iref)); + return -EUCLEAN; + } + + iref = (struct btrfs_inode_ref *)ptr; + namelen = btrfs_inode_ref_name_len(leaf, iref); + if (ptr + sizeof(*iref) + namelen > end) { + inode_ref_err(fs_info, leaf, slot, + "inode ref overflow, ptr %lu end %lu namelen %u", + ptr, end, namelen); + return -EUCLEAN; + } + + /* + * NOTE: In theory we should record all found index numbers + * to find any duplicated indexes, but that will be too time + * consuming for inodes with too many hard links. + */ + ptr += sizeof(*iref) + namelen; + } + return 0; +} + /* * Common point to switch the item-specific validation.
*/ @@ -1252,7 +1360,10 @@ static int check_leaf_item(struct extent_buffer *leaf, case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: - ret = check_dir_item(leaf, key, slot); + ret = check_dir_item(leaf, key, prev_key, slot); + break; + case BTRFS_INODE_REF_KEY: + ret = check_inode_ref(leaf, key, prev_key, slot); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(leaf, key, slot); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8a6cc600bf18..6f757361db53 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -559,7 +559,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root, key.objectid = objectid; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + inode = btrfs_iget(root->fs_info->sb, &key, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -945,54 +945,32 @@ static noinline int backref_in_log(struct btrfs_root *log, const char *name, int namelen) { struct btrfs_path *path; - struct btrfs_inode_ref *ref; - unsigned long ptr; - unsigned long ptr_end; - unsigned long name_ptr; - int found_name_len; - int item_size; int ret; - int match = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret != 0) + if (ret < 0) { goto out; - - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - - if (key->type == BTRFS_INODE_EXTREF_KEY) { - if (btrfs_find_name_in_ext_backref(path->nodes[0], - path->slots[0], - ref_objectid, - name, namelen)) - match = 1; - + } else if (ret == 1) { + ret = 0; goto out; } - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - ref = (struct btrfs_inode_ref *)ptr; - found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); - if (found_name_len == namelen) { - name_ptr = (unsigned long)(ref + 1); - ret = memcmp_extent_buffer(path->nodes[0], name, - name_ptr, namelen); - if (ret == 0) { - match = 1; - goto out; - } - } - ptr = (unsigned long)(ref + 1) + found_name_len; - } + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, + name, namelen); + else + ret = !!btrfs_find_name_in_backref(path->nodes[0], + path->slots[0], + name, namelen); out: btrfs_free_path(path); - return match; + return ret; } static inline int __add_inode_ref(struct btrfs_trans_handle *trans, @@ -1050,10 +1028,13 @@ again: (unsigned long)(victim_ref + 1), victim_name_len); - if (!backref_in_log(log_root, &search_key, - parent_objectid, - victim_name, - victim_name_len)) { + ret = backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len); + if (ret < 0) { + kfree(victim_name); + return ret; + } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); @@ -1115,10 +1096,12 @@ again: search_key.offset = btrfs_extref_hash(parent_objectid, victim_name, victim_name_len); - ret = 0; - if (!backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len)) { + ret = backref_in_log(log_root, &search_key, + parent_objectid, victim_name, + victim_name_len); + if (ret < 0) { + return ret; + } else if (!ret) { ret = -ENOENT; victim_parent = read_one_inode(root, parent_objectid); @@ -1885,30 +1868,6 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } /* - * Return true if an inode reference exists in the log for the given name, - * inode and parent inode. 
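The rewritten backref_in_log() above replaces the open-coded ref walk with the btrfs_find_name_in_backref()/btrfs_find_name_in_ext_backref() helpers and adopts a tri-state return: negative errno on lookup failure, 0 if the name is absent, 1 if present, with !! collapsing the helpers' pointer result to 0/1. The callers updated below can then tell real errors apart from "not found". A standalone sketch of that convention, over a hypothetical lookup table of our own:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical name table standing in for a log-tree search. */
static const char *names[] = { "foo", "bar" };

/* Tri-state result: <0 errno-style failure, 0 not found, 1 found. */
static int name_in_table(const char *name)
{
	unsigned int i;

	if (!name)
		return -EINVAL;	/* the "ret < 0" path */
	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		if (strcmp(names[i], name) == 0)
			return 1;	/* like !!btrfs_find_name_in_backref() */
	return 0;
}

int main(void)
{
	printf("foo: %d, baz: %d, NULL: %d\n",
	       name_in_table("foo"), name_in_table("baz"),
	       name_in_table(NULL));
	return 0;
}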
- */ -static bool name_in_log_ref(struct btrfs_root *log_root, - const char *name, const int name_len, - const u64 dirid, const u64 ino) -{ - struct btrfs_key search_key; - - search_key.objectid = ino; - search_key.type = BTRFS_INODE_REF_KEY; - search_key.offset = dirid; - if (backref_in_log(log_root, &search_key, dirid, name, name_len)) - return true; - - search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = btrfs_extref_hash(dirid, name, name_len); - if (backref_in_log(log_root, &search_key, dirid, name, name_len)) - return true; - - return false; -} - -/* * take a single entry in a log directory item and replay it into * the subvolume. * @@ -2024,8 +1983,31 @@ out: return ret; insert: - if (name_in_log_ref(root->log_root, name, name_len, - key->objectid, log_key.objectid)) { + /* + * Check if the inode reference exists in the log for the given name, + * inode and parent inode + */ + found_key.objectid = log_key.objectid; + found_key.type = BTRFS_INODE_REF_KEY; + found_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &found_key, 0, name, name_len); + if (ret < 0) { + goto out; + } else if (ret) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } + + found_key.objectid = log_key.objectid; + found_key.type = BTRFS_INODE_EXTREF_KEY; + found_key.offset = key->objectid; + ret = backref_in_log(root->log_root, &found_key, key->objectid, name, + name_len); + if (ret < 0) { + goto out; + } else if (ret) { /* The dentry will be added later. */ ret = 0; update_size = false; @@ -2869,7 +2851,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, level = btrfs_header_level(log->node); orig_level = level; path->nodes[level] = log->node; - extent_buffer_get(log->node); + atomic_inc(&log->node->refs); path->slots[level] = 0; while (1) { @@ -4983,7 +4965,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root, NULL); + inode = btrfs_iget(fs_info->sb, &key, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent @@ -4993,8 +4975,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = PTR_ERR(inode); if (ret == -ENOENT) { key.objectid = parent; - inode = btrfs_iget(fs_info->sb, &key, root, - NULL); + inode = btrfs_iget(fs_info->sb, &key, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); } else { @@ -5699,7 +5680,7 @@ process_leaf: continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); + di_inode = btrfs_iget(fs_info->sb, &di_key, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto next_dir_inode; @@ -5825,8 +5806,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, &inode_key, - root, NULL); + dir_inode = btrfs_iget(fs_info->sb, &inode_key, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. 
This is to prevent @@ -5900,7 +5880,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, search_key.objectid = found_key.offset; search_key.type = BTRFS_INODE_ITEM_KEY; search_key.offset = 0; - inode = btrfs_iget(fs_info->sb, &search_key, root, NULL); + inode = btrfs_iget(fs_info->sb, &search_key, root); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bdfe4493e43a..d8e5560db285 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -58,6 +58,30 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, }, + [BTRFS_RAID_RAID1C3] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 3, + .ncopies = 3, + .raid_name = "raid1c3", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, + .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, + }, + [BTRFS_RAID_RAID1C4] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 4, + .tolerated_failures = 3, + .devs_increment = 4, + .ncopies = 4, + .raid_name = "raid1c4", + .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, + .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, + }, [BTRFS_RAID_DUP] = { .sub_stripes = 1, .dev_stripes = 2, @@ -297,7 +321,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -struct list_head *btrfs_get_fs_uuids(void) +struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) { return &fs_uuids; } @@ -397,8 +421,6 @@ static struct btrfs_device *__alloc_device(void) INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); - spin_lock_init(&dev->io_lock); - atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); @@ -501,212 +523,6 @@ error: return ret; } -static void requeue_list(struct btrfs_pending_bios *pending_bios, - struct bio *head, struct bio *tail) -{ - - struct bio *old_head; - - old_head = pending_bios->head; - pending_bios->head = head; - if (pending_bios->tail) - tail->bi_next = old_head; - else - pending_bios->tail = tail; -} - -/* - * we try to collect pending bios for a device so we don't get a large - * number of procs sending bios down to the same device. This greatly - * improves the schedulers ability to collect and merge the bios. - * - * But, it also turns into a long list of bios to process and that is sure - * to eventually make the worker thread block. The solution here is to - * make some progress and then put this work struct back at the end of - * the list if the block device is congested. This way, multiple devices - * can make progress from a single worker thread. - */ -static noinline void run_scheduled_bios(struct btrfs_device *device) -{ - struct btrfs_fs_info *fs_info = device->fs_info; - struct bio *pending; - struct backing_dev_info *bdi; - struct btrfs_pending_bios *pending_bios; - struct bio *tail; - struct bio *cur; - int again = 0; - unsigned long num_run; - unsigned long batch_run = 0; - unsigned long last_waited = 0; - int force_reg = 0; - int sync_pending = 0; - struct blk_plug plug; - - /* - * this function runs all the bios we've collected for - * a particular device. We don't want to wander off to - * another device without first sending all of these down. 
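The raid1c3/raid1c4 entries added to btrfs_raid_array above follow the pattern of the existing mirrored profiles: ncopies matches devs_increment, and a profile survives ncopies - 1 lost devices. A trimmed standalone mirror of that table, using our own reduced struct rather than the kernel's btrfs_raid_attr:

#include <stdio.h>

/* Our reduced view of a mirrored profile's parameters. */
struct raid_attr {
	const char *raid_name;
	int devs_min;		/* minimum devices to create the profile */
	int devs_increment;	/* devices consumed per allocation unit */
	int ncopies;		/* copies of every block */
	int tolerated_failures;	/* devices that may fail without data loss */
};

static const struct raid_attr profiles[] = {
	{ "raid1",   2, 2, 2, 1 },
	{ "raid1c3", 3, 3, 3, 2 },	/* added by this series */
	{ "raid1c4", 4, 4, 4, 3 },	/* added by this series */
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(profiles) / sizeof(profiles[0]); i++)
		printf("%-8s devs_min=%d copies=%d survives %d failure(s)\n",
		       profiles[i].raid_name, profiles[i].devs_min,
		       profiles[i].ncopies, profiles[i].tolerated_failures);
	return 0;
}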
- * So, setup a plug here and finish it off before we return - */ - blk_start_plug(&plug); - - bdi = device->bdev->bd_bdi; - -loop: - spin_lock(&device->io_lock); - -loop_lock: - num_run = 0; - - /* take all the bios off the list at once and process them - * later on (without the lock held). But, remember the - * tail and other pointers so the bios can be properly reinserted - * into the list if we hit congestion - */ - if (!force_reg && device->pending_sync_bios.head) { - pending_bios = &device->pending_sync_bios; - force_reg = 1; - } else { - pending_bios = &device->pending_bios; - force_reg = 0; - } - - pending = pending_bios->head; - tail = pending_bios->tail; - WARN_ON(pending && !tail); - - /* - * if pending was null this time around, no bios need processing - * at all and we can stop. Otherwise it'll loop back up again - * and do an additional check so no bios are missed. - * - * device->running_pending is used to synchronize with the - * schedule_bio code. - */ - if (device->pending_sync_bios.head == NULL && - device->pending_bios.head == NULL) { - again = 0; - device->running_pending = 0; - } else { - again = 1; - device->running_pending = 1; - } - - pending_bios->head = NULL; - pending_bios->tail = NULL; - - spin_unlock(&device->io_lock); - - while (pending) { - - rmb(); - /* we want to work on both lists, but do more bios on the - * sync list than the regular list - */ - if ((num_run > 32 && - pending_bios != &device->pending_sync_bios && - device->pending_sync_bios.head) || - (num_run > 64 && pending_bios == &device->pending_sync_bios && - device->pending_bios.head)) { - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - goto loop_lock; - } - - cur = pending; - pending = pending->bi_next; - cur->bi_next = NULL; - - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); - - /* - * if we're doing the sync list, record that our - * plug has some sync requests on it - * - * If we're doing the regular list and there are - * sync requests sitting around, unplug before - * we add more - */ - if (pending_bios == &device->pending_sync_bios) { - sync_pending = 1; - } else if (sync_pending) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } - - btrfsic_submit_bio(cur); - num_run++; - batch_run++; - - cond_resched(); - - /* - * we made progress, there is more work to do and the bdi - * is now congested. Back off and let other work structs - * run instead - */ - if (pending && bdi_write_congested(bdi) && batch_run > 8 && - fs_info->fs_devices->open_devices > 1) { - struct io_context *ioc; - - ioc = current->io_context; - - /* - * the main goal here is that we don't want to - * block if we're going to be able to submit - * more requests without blocking. - * - * This code does two great things, it pokes into - * the elevator code from a filesystem _and_ - * it makes assumptions about how batching works. - */ - if (ioc && ioc->nr_batch_requests > 0 && - time_before(jiffies, ioc->last_waited + HZ/50UL) && - (last_waited == 0 || - ioc->last_waited == last_waited)) { - /* - * we want to go through our batch of - * requests and stop. 
So, we copy out - * the ioc->last_waited time and test - * against it before looping - */ - last_waited = ioc->last_waited; - cond_resched(); - continue; - } - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - device->running_pending = 1; - - spin_unlock(&device->io_lock); - btrfs_queue_work(fs_info->submit_workers, - &device->work); - goto done; - } - } - - cond_resched(); - if (again) - goto loop; - - spin_lock(&device->io_lock); - if (device->pending_bios.head || device->pending_sync_bios.head) - goto loop_lock; - spin_unlock(&device->io_lock); - -done: - blk_finish_plug(&plug); -} - -static void pending_bios_fn(struct btrfs_work *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, work); - run_scheduled_bios(device); -} - static bool device_path_matched(const char *path, struct btrfs_device *device) { int found; @@ -818,7 +634,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - fs_devices->seeding = 1; + fs_devices->seeding = true; } else { if (bdev_read_only(bdev)) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -828,7 +644,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, q = bdev_get_queue(bdev); if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; + fs_devices->rotating = true; device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@ -1005,11 +821,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, *new_device_added = true; if (disk_super->label[0]) - pr_info("BTRFS: device label %s devid %llu transid %llu %s\n", - disk_super->label, devid, found_transid, path); + pr_info( + "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->label, devid, found_transid, path, + current->comm, task_pid_nr(current)); else - pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n", - disk_super->fsid, devid, found_transid, path); + pr_info( + "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", + disk_super->fsid, devid, found_transid, path, + current->comm, task_pid_nr(current)); } else if (!device->name || strcmp(device->name->str, path)) { /* @@ -1295,7 +1115,7 @@ static int close_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; - fs_devices->seeding = false; + fs_devices->seeding = false; return 0; } @@ -2048,7 +1868,7 @@ static struct btrfs_device * btrfs_find_next_active_device( * where this function is called, there should always be another device (or * this_dev) which is active.
*/ -void btrfs_assign_next_active_device(struct btrfs_device *device, +void __cold btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev) { struct btrfs_fs_info *fs_info = device->fs_info; @@ -2450,11 +2270,11 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = 0; + fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; - fs_devices->rotating = 0; + fs_devices->rotating = false; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); @@ -2649,7 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; + fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); btrfs_set_super_total_bytes(fs_info->super_copy, @@ -3177,7 +2997,7 @@ error: static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 bytes_used; u64 chunk_type; @@ -3186,27 +3006,28 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, chunk_type = cache->flags; btrfs_put_block_group(cache); - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) { - spin_lock(&fs_info->data_sinfo->lock); - bytes_used = fs_info->data_sinfo->bytes_used; - spin_unlock(&fs_info->data_sinfo->lock); + if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); - if (!bytes_used) { - struct btrfs_trans_handle *trans; - int ret; + if (!bytes_used) { + struct btrfs_trans_handle *trans; + int ret; - trans = btrfs_join_transaction(fs_info->tree_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - ret = btrfs_force_chunk_alloc(trans, - BTRFS_BLOCK_GROUP_DATA); - btrfs_end_transaction(trans); - if (ret < 0) - return ret; - return 1; - } + ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); + btrfs_end_transaction(trans); + if (ret < 0) + return ret; + return 1; } + return 0; } @@ -3385,28 +3206,28 @@ static int chunk_profiles_filter(u64 chunk_type, static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); + chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh_min = 0; else - user_thresh_min = div_factor_fine(cache->key.offset, - bargs->usage_min); + user_thresh_min = div_factor_fine(cache->length, + bargs->usage_min); if (bargs->usage_max == 0) user_thresh_max = 1; else if (bargs->usage_max > 100) - user_thresh_max = cache->key.offset; + user_thresh_max = cache->length; else - user_thresh_max = div_factor_fine(cache->key.offset, - bargs->usage_max); + user_thresh_max = div_factor_fine(cache->length, + bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) ret = 0; @@ -3418,20 +3239,19 @@ static int 
chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); + chunk_used = cache->used; if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) - user_thresh = cache->key.offset; + user_thresh = cache->length; else - user_thresh = div_factor_fine(cache->key.offset, - bargs->usage); + user_thresh = div_factor_fine(cache->length, bargs->usage); if (chunk_used < user_thresh) ret = 0; @@ -3844,12 +3664,7 @@ static int alloc_profile_is_valid(u64 flags, int extended) if (flags == 0) return !extended; /* "0" is valid for usual profiles */ - /* true if exactly one bit set */ - /* - * Don't use is_power_of_2(unsigned long) because it won't work - * for the single profile (1ULL << 48) on 32-bit CPUs. - */ - return flags != 0 && (flags & (flags - 1)) == 0; + return has_single_bit_set(flags); } static inline int balance_need_close(struct btrfs_fs_info *fs_info) @@ -4036,7 +3851,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; - bool reducing_integrity; + bool reducing_redundancy; int i; if (btrfs_fs_closing(fs_info) || @@ -4119,9 +3934,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && !(bctl->meta.target & allowed))) - reducing_integrity = true; + reducing_redundancy = true; else - reducing_integrity = false; + reducing_redundancy = false; /* if we're not converting, the target field is uninitialized */ meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
@@ -4130,13 +3945,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (reducing_integrity) { + if (reducing_redundancy) { if (bctl->flags & BTRFS_BALANCE_FORCE) { btrfs_info(fs_info, - "balance: force reducing metadata integrity"); + "balance: force reducing metadata redundancy"); } else { btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); + "balance: reduces metadata redundancy, use --force if you want this"); ret = -EINVAL; goto out; } @@ -4902,6 +4717,14 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } +static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) + return; + + btrfs_set_fs_incompat(info, RAID1C34); +} + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) { @@ -4967,6 +4790,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; + devs_max = min_t(int, devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); @@ -5047,8 +4871,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); - /* round down to number of usable stripes */ - ndevs = round_down(ndevs, devs_increment); + /* + * Round down to number of usable stripes, devs_increment can be any + * number so we can't use round_down() + */ + ndevs -= ndevs % devs_increment; if (ndevs < devs_min) { ret = -ENOSPC; @@ -5164,6 +4991,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); check_raid56_incompat_flag(info, type); + check_raid1c34_incompat_flag(info, type); kfree(devices_info); return 0; @@ -5582,12 +5410,13 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) * replace. */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 length, + u64 logical, u64 *length_ret, struct btrfs_bio **bbio_ret) { struct extent_map *em; struct map_lookup *map; struct btrfs_bio *bbio; + u64 length = *length_ret; u64 offset; u64 stripe_nr; u64 stripe_nr_end; @@ -5620,7 +5449,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } offset = logical - em->start; - length = min_t(u64, em->len - offset, length); + length = min_t(u64, em->start + em->len - logical, length); + *length_ret = length; stripe_len = map->stripe_len; /* @@ -6035,7 +5865,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, - *length, bbio_ret); + length, bbio_ret); ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); if (ret < 0) @@ -6415,52 +6245,8 @@ static void btrfs_end_bio(struct bio *bio) } } -/* - * see run_scheduled_bios for a description of why bios are collected for - * async submit. - * - * This will add one bio to the pending list for a device and make sure - * the work struct is scheduled. 
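Several hunks in this file swap is_power_of_2() for has_single_bit_set(), and the comment removed from alloc_profile_is_valid() above explains why: is_power_of_2() takes an unsigned long, so on 32-bit CPUs a u64 profile flag such as the single profile bit (1ULL << 48) is truncated and tests false. A standalone sketch contrasting the two behaviours; the helper mirrors the kernel's intent, while the 32-bit variant is our illustration of the failure mode:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* u64-safe "exactly one bit set" test; does not truncate on 32-bit. */
static bool has_single_bit_set(uint64_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

/* What is_power_of_2(unsigned long) would compute on a 32-bit CPU. */
static bool is_power_of_2_ulong32(uint64_t n)
{
	uint32_t t = (uint32_t)n;	/* silent truncation */

	return t != 0 && (t & (t - 1)) == 0;
}

int main(void)
{
	uint64_t single_profile = 1ULL << 48;	/* high block group flag bit */

	printf("has_single_bit_set:   %d\n", has_single_bit_set(single_profile));
	printf("32-bit is_power_of_2: %d\n", is_power_of_2_ulong32(single_profile));
	return 0;
}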
- */ -static noinline void btrfs_schedule_bio(struct btrfs_device *device, - struct bio *bio) -{ - struct btrfs_fs_info *fs_info = device->fs_info; - int should_queue = 1; - struct btrfs_pending_bios *pending_bios; - - /* don't bother with additional async steps for reads, right now */ - if (bio_op(bio) == REQ_OP_READ) { - btrfsic_submit_bio(bio); - return; - } - - WARN_ON(bio->bi_next); - bio->bi_next = NULL; - - spin_lock(&device->io_lock); - if (op_is_sync(bio->bi_opf)) - pending_bios = &device->pending_sync_bios; - else - pending_bios = &device->pending_bios; - - if (pending_bios->tail) - pending_bios->tail->bi_next = bio; - - pending_bios->tail = bio; - if (!pending_bios->head) - pending_bios->head = bio; - if (device->running_pending) - should_queue = 0; - - spin_unlock(&device->io_lock); - - if (should_queue) - btrfs_queue_work(fs_info->submit_workers, &device->work); -} - static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, - u64 physical, int dev_nr, int async) + u64 physical, int dev_nr) { struct btrfs_device *dev = bbio->stripes[dev_nr].dev; struct btrfs_fs_info *fs_info = bbio->fs_info; @@ -6478,10 +6264,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_bio_counter_inc_noblocked(fs_info); - if (async) - btrfs_schedule_bio(dev, bio); - else - btrfsic_submit_bio(bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) @@ -6502,7 +6285,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit) + int mirror_num) { struct btrfs_device *dev; struct bio *first_bio = bio; @@ -6571,7 +6354,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, bio = first_bio; submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, - dev_nr, async_submit); + dev_nr); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; @@ -6675,9 +6458,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - btrfs_init_work(&dev->work, btrfs_submit_helper, - pending_bios_fn, NULL, NULL); - return dev; } @@ -6874,7 +6654,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - fs_devices->seeding = 1; + fs_devices->seeding = true; fs_devices->opened = 1; return fs_devices; } @@ -7063,48 +6843,49 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) sb_array_offset += len; cur_offset += len; - if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_array_offset; - /* - * At least one btrfs_chunk with one stripe must be - * present, exact stripe count check comes afterwards - */ - len = btrfs_chunk_item_size(1); - if (cur_offset + len > array_size) - goto out_short_read; - - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - if (!num_stripes) { - btrfs_err(fs_info, - "invalid number of stripes %u in sys_array at offset %u", - num_stripes, cur_offset); - ret = -EIO; - break; - } + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + btrfs_err(fs_info, + "unexpected item type %u in sys_array at offset %u", + (u32)key.type, cur_offset); + ret = -EIO; + break; + } - type = btrfs_chunk_type(sb, chunk); - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { - btrfs_err(fs_info, - "invalid chunk type %llu in sys_array at offset %u", - type, cur_offset); - ret = -EIO; - break; - } + chunk = (struct btrfs_chunk 
*)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be present, + * exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; - len = btrfs_chunk_item_size(num_stripes); - if (cur_offset + len > array_size) - goto out_short_read; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + btrfs_err(fs_info, + "invalid number of stripes %u in sys_array at offset %u", + num_stripes, cur_offset); + ret = -EIO; + break; + } - ret = read_one_chunk(&key, sb, chunk); - if (ret) - break; - } else { + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { btrfs_err(fs_info, - "unexpected item type %u in sys_array at offset %u", - (u32)key.type, cur_offset); + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); ret = -EIO; break; } + + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + + ret = read_one_chunk(&key, sb, chunk); + if (ret) + break; + array_ptr += len; sb_array_offset += len; cur_offset += len; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a7da1f3e3627..fc1b564b9cfe 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -18,10 +18,6 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; -struct btrfs_pending_bios { - struct bio *head; - struct bio *tail; -}; struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ @@ -68,13 +64,6 @@ struct btrfs_device { u64 generation; - spinlock_t io_lock ____cacheline_aligned; - int running_pending; - /* regular prio bios */ - struct btrfs_pending_bios pending_bios; - /* sync bios */ - struct btrfs_pending_bios pending_sync_bios; - struct block_device *bdev; /* the mode sent to blkdev_get */ @@ -254,14 +243,14 @@ struct btrfs_fs_devices { struct list_head alloc_list; struct btrfs_fs_devices *seed; - int seeding; + bool seeding; int opened; /* set when we find or add a device that doesn't have the * nonrot flag set */ - int rotating; + bool rotating; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -330,7 +319,6 @@ struct btrfs_bio { u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; - unsigned long flags; void *private; atomic_t error; int max_errors; @@ -436,7 +424,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit); + int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, @@ -557,6 +545,10 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) return BTRFS_RAID_RAID10; else if (flags & BTRFS_BLOCK_GROUP_RAID1) return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) + return BTRFS_RAID_RAID1C3; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) + return BTRFS_RAID_RAID1C4; else if (flags & BTRFS_BLOCK_GROUP_DUP) return BTRFS_RAID_DUP; else if (flags & BTRFS_BLOCK_GROUP_RAID0) @@ -571,7 +563,7 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) void btrfs_commit_device_sizes(struct btrfs_transaction *trans); -struct list_head *btrfs_get_fs_uuids(void); +struct list_head * __attribute_const__ 
btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index df1aace5df50..a6c90a003c12 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -29,19 +29,9 @@ struct workspace { static struct workspace_manager wsm; -static void zlib_init_workspace_manager(void) +struct list_head *zlib_get_workspace(unsigned int level) { - btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress); -} - -static void zlib_cleanup_workspace_manager(void) -{ - btrfs_cleanup_workspace_manager(&wsm); -} - -static struct list_head *zlib_get_workspace(unsigned int level) -{ - struct list_head *ws = btrfs_get_workspace(&wsm, level); + struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level); struct workspace *workspace = list_entry(ws, struct workspace, list); workspace->level = level; @@ -49,12 +39,7 @@ static struct list_head *zlib_get_workspace(unsigned int level) return ws; } -static void zlib_put_workspace(struct list_head *ws) -{ - btrfs_put_workspace(&wsm, ws); -} - -static void zlib_free_workspace(struct list_head *ws) +void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -63,7 +48,7 @@ static void zlib_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zlib_alloc_workspace(unsigned int level) +struct list_head *zlib_alloc_workspace(unsigned int level) { struct workspace *workspace; int workspacesize; @@ -88,13 +73,9 @@ fail: return ERR_PTR(-ENOMEM); } -static int zlib_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; @@ -228,7 +209,7 @@ out: return ret; } -static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; @@ -319,10 +300,9 @@ done: return ret; } -static int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; @@ -419,15 +399,7 @@ next: } const struct btrfs_compress_op btrfs_zlib_compress = { - .init_workspace_manager = zlib_init_workspace_manager, - .cleanup_workspace_manager = zlib_cleanup_workspace_manager, - .get_workspace = zlib_get_workspace, - .put_workspace = zlib_put_workspace, - .alloc_workspace = zlib_alloc_workspace, - .free_workspace = zlib_free_workspace, - .compress_pages = zlib_compress_pages, - .decompress_bio = zlib_decompress_bio, - .decompress = zlib_decompress, + .workspace_manager = &wsm, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 764d47b107e5..9a4871636c6c 100644 --- a/fs/btrfs/zstd.c 
+++ b/fs/btrfs/zstd.c @@ -91,9 +91,8 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, list); } -static void zstd_free_workspace(struct list_head *ws); -static struct list_head *zstd_alloc_workspace(unsigned int level); - +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_alloc_workspace(unsigned int level); /* * zstd_reclaim_timer_fn - reclaim timer * @t: timer @@ -168,7 +167,7 @@ static void zstd_calc_ws_mem_sizes(void) } } -static void zstd_init_workspace_manager(void) +void zstd_init_workspace_manager(void) { struct list_head *ws; int i; @@ -194,7 +193,7 @@ static void zstd_init_workspace_manager(void) } } -static void zstd_cleanup_workspace_manager(void) +void zstd_cleanup_workspace_manager(void) { struct workspace *workspace; int i; @@ -261,7 +260,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ -static struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(unsigned int level) { struct list_head *ws; unsigned int nofs_flag; @@ -302,7 +301,7 @@ again: * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. */ -static void zstd_put_workspace(struct list_head *ws) +void zstd_put_workspace(struct list_head *ws) { struct workspace *workspace = list_to_workspace(ws); @@ -332,7 +331,7 @@ static void zstd_put_workspace(struct list_head *ws) cond_wake_up(&wsm.wait); } -static void zstd_free_workspace(struct list_head *ws) +void zstd_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -341,7 +340,7 @@ static void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -static struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(unsigned int level) { struct workspace *workspace; @@ -367,13 +366,9 @@ fail: return ERR_PTR(-ENOMEM); } -static int zstd_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, - struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); ZSTD_CStream *stream; @@ -548,7 +543,7 @@ out: return ret; } -static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct page **pages_in = cb->compressed_pages; @@ -626,10 +621,9 @@ done: return ret; } -static int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); ZSTD_DStream *stream; @@ -712,15 +706,8 @@ finish: } const struct btrfs_compress_op btrfs_zstd_compress = { - .init_workspace_manager = zstd_init_workspace_manager, - .cleanup_workspace_manager = 
zstd_cleanup_workspace_manager, - .get_workspace = zstd_get_workspace, - .put_workspace = zstd_put_workspace, - .alloc_workspace = zstd_alloc_workspace, - .free_workspace = zstd_free_workspace, - .compress_pages = zstd_compress_pages, - .decompress_bio = zstd_decompress_bio, - .decompress = zstd_decompress, + /* ZSTD uses own workspace manager */ + .workspace_manager = NULL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; diff --git a/fs/buffer.c b/fs/buffer.c index 86a38b979323..d8c7242426bb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -47,6 +47,9 @@ #include <linux/pagevec.h> #include <linux/sched/mm.h> #include <trace/events/block.h> +#include <linux/fscrypt.h> + +#include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, @@ -246,10 +249,6 @@ out: return ret; } -/* - * I/O completion handler for block_read_full_page() - pages - * which come unlocked at the end of I/O. - */ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { unsigned long flags; @@ -307,6 +306,47 @@ still_busy: return; } +struct decrypt_bh_ctx { + struct work_struct work; + struct buffer_head *bh; +}; + +static void decrypt_bh(struct work_struct *work) +{ + struct decrypt_bh_ctx *ctx = + container_of(work, struct decrypt_bh_ctx, work); + struct buffer_head *bh = ctx->bh; + int err; + + err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size, + bh_offset(bh)); + end_buffer_async_read(bh, err == 0); + kfree(ctx); +} + +/* + * I/O completion handler for block_read_full_page() - pages + * which come unlocked at the end of I/O. + */ +static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) +{ + /* Decrypt if needed */ + if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) && + IS_ENCRYPTED(bh->b_page->mapping->host) && + S_ISREG(bh->b_page->mapping->host->i_mode)) { + struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + + if (ctx) { + INIT_WORK(&ctx->work, decrypt_bh); + ctx->bh = bh; + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + uptodate = 0; + } + end_buffer_async_read(bh, uptodate); +} + /* * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. 
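A note on the fs/buffer.c hunk above and the one that follows: buffer_head read completion (end_buffer_async_read) runs in atomic context, while fscrypt_decrypt_pagecache_blocks() may sleep, so the new end_buffer_async_read_io() packages the buffer_head into a context allocated with GFP_ATOMIC and defers the decryption to a workqueue via fscrypt_enqueue_decrypt_work(); if that allocation fails, the read is reported as not uptodate rather than completing with ciphertext in the page cache. Below is a minimal sketch of the same defer-to-workqueue idiom, using hypothetical my_* stand-ins (not kernel API) for the buffer type and the crypto helper:

	#include <linux/types.h>
	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct my_buffer {
		void *data;
		bool encrypted;
	};

	/* Final completion handler; stand-in for end_buffer_async_read(). */
	static void my_end_io(struct my_buffer *buf, int uptodate) { }

	/* Heavy post-processing that may sleep; stand-in for the fscrypt call. */
	static int my_decrypt(struct my_buffer *buf) { return 0; }

	struct my_deferred_ctx {
		struct work_struct work;
		struct my_buffer *buf;
	};

	static void my_deferred_completion(struct work_struct *work)
	{
		struct my_deferred_ctx *ctx =
			container_of(work, struct my_deferred_ctx, work);
		int err = my_decrypt(ctx->buf);	/* sleeping is fine on a workqueue */

		my_end_io(ctx->buf, err == 0);
		kfree(ctx);
	}

	/* Called in softirq context, where sleeping is forbidden. */
	static void my_end_io_atomic(struct my_buffer *buf, int uptodate)
	{
		if (uptodate && buf->encrypted) {
			/* GFP_ATOMIC: no sleeping allowed at this point */
			struct my_deferred_ctx *ctx =
				kmalloc(sizeof(*ctx), GFP_ATOMIC);

			if (ctx) {
				INIT_WORK(&ctx->work, my_deferred_completion);
				ctx->buf = buf;
				queue_work(system_unbound_wq, &ctx->work);
				return;
			}
			uptodate = 0;	/* no memory: fail the read */
		}
		my_end_io(buf, uptodate);
	}

The real patch uses fscrypt's dedicated queue (fscrypt_enqueue_decrypt_work) rather than a raw queue_work(), and the next hunk wires the new handler in through mark_buffer_async_read().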
@@ -379,7 +419,7 @@ EXPORT_SYMBOL(end_buffer_async_write); */ static void mark_buffer_async_read(struct buffer_head *bh) { - bh->b_end_io = end_buffer_async_read; + bh->b_end_io = end_buffer_async_read_io; set_buffer_async_read(bh); } @@ -1385,10 +1425,10 @@ static bool has_bh_in_lru(int cpu, void *dummy) for (i = 0; i < BH_LRU_SIZE; i++) { if (b->bhs[i]) - return 1; + return true; } - return 0; + return false; } void invalidate_bh_lrus(void) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index b2ec29eeb4c4..73f24f307a4a 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -8,6 +8,7 @@ #include <linux/ceph/ceph_debug.h> +#include <linux/fs_context.h> #include "super.h" #include "cache.h" @@ -49,7 +50,7 @@ void ceph_fscache_unregister(void) fscache_unregister_netfs(&ceph_cache_netfs); } -int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { const struct ceph_fsid *fsid = &fsc->client->fsid; const char *fscache_uniq = fsc->mount_options->fscache_uniq; @@ -66,8 +67,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) continue; - pr_err("fscache cookie already registered for fsid %pU\n", fsid); - pr_err(" use fsc=%%s mount option to specify a uniquifier\n"); + errorf(fc, "ceph: fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option", + fsid); err = -EBUSY; goto out_unlock; } @@ -95,7 +96,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) list_add_tail(&ent->list, &ceph_fscache_list); } else { kfree(ent); - pr_err("unable to register fscache cookie for fsid %pU\n", + errorf(fc, "ceph: unable to register fscache cookie for fsid %pU", fsid); /* all other fs ignore this error */ } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index e486fac3434d..89dbdd1eb14a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -16,7 +16,7 @@ extern struct fscache_netfs ceph_cache_netfs; int ceph_fscache_register(void); void ceph_fscache_unregister(void); -int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_register_inode_cookie(struct inode *inode); @@ -88,7 +88,8 @@ static inline void ceph_fscache_unregister(void) { } -static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, + struct fs_context *fc) { return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d3b9c9d5c1bd..f5a38910a82b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1058,6 +1058,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + /* remove from inode's cap rbtree, and clear auth cap */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { @@ -1091,11 +1096,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) spin_unlock(&session->s_cap_lock); - /* remove from inode list */ - rb_erase(&cap->ci_node, &ci->i_caps); - if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; - if (removed) ceph_put_cap(mdsc, cap); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4ca0b8ff9a72..2e4764fd1872 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1553,36 +1553,37 @@ static int 
ceph_d_revalidate(struct dentry *dentry, unsigned int flags) { int valid = 0; struct dentry *parent; - struct inode *dir; + struct inode *dir, *inode; if (flags & LOOKUP_RCU) { parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) return -ECHILD; + inode = d_inode_rcu(dentry); } else { parent = dget_parent(dentry); dir = d_inode(parent); + inode = d_inode(dentry); } dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, - dentry, d_inode(dentry), ceph_dentry(dentry)->offset); + dentry, inode, ceph_dentry(dentry)->offset); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, - dentry, d_inode(dentry)); + dentry, inode); valid = 1; - } else if (d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { + } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { valid = 1; } else { valid = dentry_lease_is_valid(dentry, flags); if (valid == -ECHILD) return valid; if (valid || dir_lease_is_valid(dir, dentry)) { - if (d_really_is_positive(dentry)) - valid = ceph_is_any_caps(d_inode(dentry)); + if (inode) + valid = ceph_is_any_caps(inode); else valid = 1; } @@ -1808,6 +1809,7 @@ const struct file_operations ceph_dir_fops = { .open = ceph_open, .release = ceph_release, .unlocked_ioctl = ceph_ioctl, + .compat_ioctl = compat_ptr_ioctl, .fsync = ceph_fsync, .lock = ceph_lock, .flock = ceph_flock, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d277f71abe0b..11929d2bb594 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -462,6 +462,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + } else if (!d_in_lookup(dentry)) { + /* If it's not being looked up, it's negative */ + return -ENOENT; } /* do the open */ @@ -750,6 +753,9 @@ static void ceph_aio_complete(struct inode *inode, if (!atomic_dec_and_test(&aio_req->pending_reqs)) return; + if (aio_req->iocb->ki_flags & IOCB_DIRECT) + inode_dio_end(inode); + ret = aio_req->error; if (!ret) ret = aio_req->total_len; @@ -1088,6 +1094,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, CEPH_CAP_FILE_RD); list_splice(&aio_req->osd_reqs, &osd_reqs); + inode_dio_begin(inode); while (!list_empty(&osd_reqs)) { req = list_first_entry(&osd_reqs, struct ceph_osd_request, @@ -1261,14 +1268,24 @@ again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_read(inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); - if (ret < 0) + if (ret < 0) { + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); return ret; + } if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || @@ -1280,16 +1297,12 @@ again: if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { - ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); - ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { - ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); - ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1300,11 
+1313,10 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); - ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); - ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } + dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); if (pinned_page) { @@ -1312,6 +1324,12 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); + + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_read(inode); + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; @@ -1956,10 +1974,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; + /* + * Striped file layouts require that we copy partial objects, but the + * OSD copy-from operation only supports full-object copies. Limit + * this to non-striped file layouts for now. + */ if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || - (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) || - (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) + (src_ci->i_layout.stripe_count != 1) || + (dst_ci->i_layout.stripe_count != 1) || + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { + dout("Invalid src/dst files layout\n"); return -EOPNOTSUPP; + } if (len < src_ci->i_layout.object_size) return -EOPNOTSUPP; /* no remote copy will be done */ @@ -2162,7 +2188,7 @@ const struct file_operations ceph_file_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, - .compat_ioctl = ceph_ioctl, + .compat_ioctl = compat_ptr_ioctl, .fallocate = ceph_fallocate, .copy_file_range = ceph_copy_file_range, }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9f135624ae47..c07407586ce8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1434,6 +1434,7 @@ retry_lookup: dout(" final dn %p\n", dn); } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP) && + test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { struct inode *dir = req->r_parent; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a8a8f84f3bbf..068b029cf073 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -384,8 +384,8 @@ static int parse_reply_info_readdir(void **p, void *end, } done: - if (*p != end) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; bad: @@ -406,12 +406,10 @@ static int parse_reply_info_filelock(void **p, void *end, goto bad; info->filelock_reply = *p; - *p += sizeof(*info->filelock_reply); - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; - bad: return -EIO; } @@ -425,18 +423,21 @@ static int parse_reply_info_create(void **p, void *end, { if (features == (u64)-1 || (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { + /* Malformed reply? 
*/ if (*p == end) { info->has_create_ino = false; } else { info->has_create_ino = true; - info->ino = ceph_decode_64(p); + ceph_decode_64_safe(p, end, info->ino, bad); } + } else { + if (*p != end) + goto bad; } - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; - bad: return -EIO; } @@ -2181,13 +2182,17 @@ retry: } base = ceph_ino(d_inode(temp)); rcu_read_unlock(); - if (pos < 0 || read_seqretry(&rename_lock, seq)) { - pr_err("build_path did not end path lookup where " - "expected, pos is %d\n", pos); - /* presumably this is only possible if racing with a - rename of one of the parent directories (we can not - lock the dentries above us to prevent this, but - retrying should be harmless) */ + + if (read_seqretry(&rename_lock, seq)) + goto retry; + + if (pos < 0) { + /* + * A rename didn't occur, but somehow we didn't end up where + * we thought we would. Throw a warning and try again. + */ + pr_warn("build_path did not end path lookup where " + "expected, pos is %d\n", pos); goto retry; } @@ -2344,6 +2349,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->op = cpu_to_le32(req->r_op); head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); + head->ino = 0; head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index ce2d00da5096..aeec1d6e3769 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; - int i; + int i, j; /* special case for one mds */ if (1 == m->m_num_mds && m->m_info[0].state > 0) @@ -35,9 +35,12 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) /* pick */ n = prandom_u32() % n; - for (i = 0; n > 0; i++, n--) - while (m->m_info[i].state <= 0) - i++; + for (j = 0, i = 0; i < m->m_num_mds; i++) { + if (m->m_info[i].state > 0) + j++; + if (j > n) + break; + } return i; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index edfd643a8205..9c9a7c68eea3 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -9,7 +9,8 @@ #include <linux/in6.h> #include <linux/module.h> #include <linux/mount.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -138,271 +139,308 @@ enum { Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, - Opt_last_int, /* int args above */ Opt_snapdirname, Opt_mds_namespace, - Opt_fscache_uniq, Opt_recover_session, - Opt_last_string, + Opt_source, /* string args above */ Opt_dirstat, - Opt_nodirstat, Opt_rbytes, - Opt_norbytes, Opt_asyncreaddir, - Opt_noasyncreaddir, Opt_dcache, - Opt_nodcache, Opt_ino32, - Opt_noino32, Opt_fscache, - Opt_nofscache, Opt_poolperm, - Opt_nopoolperm, Opt_require_active_mds, - Opt_norequire_active_mds, -#ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, -#endif - Opt_noacl, Opt_quotadf, - Opt_noquotadf, Opt_copyfrom, - Opt_nocopyfrom, }; -static match_table_t fsopt_tokens = { - {Opt_wsize, "wsize=%d"}, - {Opt_rsize, "rsize=%d"}, - {Opt_rasize, "rasize=%d"}, - {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, - {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_caps_max, "caps_max=%d"}, - {Opt_readdir_max_entries, "readdir_max_entries=%d"}, - {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, - {Opt_congestion_kb, "write_congestion_kb=%d"}, - /* int args above */ - 
{Opt_snapdirname, "snapdirname=%s"}, - {Opt_mds_namespace, "mds_namespace=%s"}, - {Opt_recover_session, "recover_session=%s"}, - {Opt_fscache_uniq, "fsc=%s"}, - /* string args above */ - {Opt_dirstat, "dirstat"}, - {Opt_nodirstat, "nodirstat"}, - {Opt_rbytes, "rbytes"}, - {Opt_norbytes, "norbytes"}, - {Opt_asyncreaddir, "asyncreaddir"}, - {Opt_noasyncreaddir, "noasyncreaddir"}, - {Opt_dcache, "dcache"}, - {Opt_nodcache, "nodcache"}, - {Opt_ino32, "ino32"}, - {Opt_noino32, "noino32"}, - {Opt_fscache, "fsc"}, - {Opt_nofscache, "nofsc"}, - {Opt_poolperm, "poolperm"}, - {Opt_nopoolperm, "nopoolperm"}, - {Opt_require_active_mds, "require_active_mds"}, - {Opt_norequire_active_mds, "norequire_active_mds"}, -#ifdef CONFIG_CEPH_FS_POSIX_ACL - {Opt_acl, "acl"}, -#endif - {Opt_noacl, "noacl"}, - {Opt_quotadf, "quotadf"}, - {Opt_noquotadf, "noquotadf"}, - {Opt_copyfrom, "copyfrom"}, - {Opt_nocopyfrom, "nocopyfrom"}, - {-1, NULL} +enum ceph_recover_session_mode { + ceph_recover_session_no, + ceph_recover_session_clean +}; + +static const struct fs_parameter_enum ceph_mount_param_enums[] = { + { Opt_recover_session, "no", ceph_recover_session_no }, + { Opt_recover_session, "clean", ceph_recover_session_clean }, + {} +}; + +static const struct fs_parameter_spec ceph_mount_param_specs[] = { + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), + fsparam_u32 ("caps_max", Opt_caps_max), + fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), + fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), + fsparam_s32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_flag_no ("copyfrom", Opt_copyfrom), + fsparam_flag_no ("dcache", Opt_dcache), + fsparam_flag_no ("dirstat", Opt_dirstat), + __fsparam (fs_param_is_string, "fsc", Opt_fscache, + fs_param_neg_with_no | fs_param_v_optional), + fsparam_flag_no ("ino32", Opt_ino32), + fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_flag_no ("poolperm", Opt_poolperm), + fsparam_flag_no ("quotadf", Opt_quotadf), + fsparam_u32 ("rasize", Opt_rasize), + fsparam_flag_no ("rbytes", Opt_rbytes), + fsparam_s32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_s32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_enum ("recover_session", Opt_recover_session), + fsparam_flag_no ("require_active_mds", Opt_require_active_mds), + fsparam_u32 ("rsize", Opt_rsize), + fsparam_string ("snapdirname", Opt_snapdirname), + fsparam_string ("source", Opt_source), + fsparam_u32 ("wsize", Opt_wsize), + {} +}; + +static const struct fs_parameter_description ceph_mount_parameters = { + .name = "ceph", + .specs = ceph_mount_param_specs, + .enums = ceph_mount_param_enums, }; -static int parse_fsopt_token(char *c, void *private) +struct ceph_parse_opts_ctx { + struct ceph_options *copts; + struct ceph_mount_options *opts; +}; + +/* + * Parse the source parameter. Distinguish the server list from the path. + * Internally we do not include the leading '/' in the path. 
+ * + * The source will look like: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ +static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) { - struct ceph_mount_options *fsopt = private; - substring_t argstr[MAX_OPT_ARGS]; - int token, intval, ret; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *dev_name = param->string, *dev_name_end; + int ret; - token = match_token((char *)c, fsopt_tokens, argstr); - if (token < 0) - return -EINVAL; + dout("%s '%s'\n", __func__, dev_name); + if (!dev_name || !*dev_name) + return invalf(fc, "ceph: Empty source"); - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad option arg (not int) at '%s'\n", c); - return ret; + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + if (strlen(dev_name_end) > 1) { + kfree(fsopt->server_path); + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) + return -ENOMEM; } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); } else { - dout("got token %d\n", token); + dev_name_end = dev_name + strlen(dev_name); } + dev_name_end--; /* back up to ':' separator */ + if (dev_name_end < dev_name || *dev_name_end != ':') + return invalf(fc, "ceph: No path or : separator in source"); + + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); + + ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, + pctx->copts, fc); + if (ret) + return ret; + + fc->source = param->string; + param->string = NULL; + return 0; +} + +static int ceph_parse_mount_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct fs_parse_result result; + unsigned int mode; + int token, ret; + + ret = ceph_parse_param(param, pctx->copts, fc); + if (ret != -ENOPARAM) + return ret; + + token = fs_parse(fc, &ceph_mount_parameters, param, &result); + dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); + if (token < 0) + return token; + switch (token) { case Opt_snapdirname: kfree(fsopt->snapdir_name); - fsopt->snapdir_name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->snapdir_name) - return -ENOMEM; + fsopt->snapdir_name = param->string; + param->string = NULL; break; case Opt_mds_namespace: kfree(fsopt->mds_namespace); - fsopt->mds_namespace = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->mds_namespace) - return -ENOMEM; + fsopt->mds_namespace = param->string; + param->string = NULL; break; case Opt_recover_session: - if (!strncmp(argstr[0].from, "no", - argstr[0].to - argstr[0].from)) { + mode = result.uint_32; + if (mode == ceph_recover_session_no) fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; - } else if (!strncmp(argstr[0].from, "clean", - argstr[0].to - argstr[0].from)) { + else if (mode == ceph_recover_session_clean) fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; - } else { - return -EINVAL; - } - break; - case Opt_fscache_uniq: - kfree(fsopt->fscache_uniq); - fsopt->fscache_uniq = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - 
if (!fsopt->fscache_uniq) - return -ENOMEM; - fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + else + BUG(); break; - /* misc */ + case Opt_source: + if (fc->source) + return invalf(fc, "ceph: Multiple sources specified"); + return ceph_parse_source(param, fc); case Opt_wsize: - if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) - return -EINVAL; - fsopt->wsize = ALIGN(intval, PAGE_SIZE); + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_WRITE_SIZE) + goto out_of_range; + fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rsize: - if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) - return -EINVAL; - fsopt->rsize = ALIGN(intval, PAGE_SIZE); + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_READ_SIZE) + goto out_of_range; + fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rasize: - if (intval < 0) - return -EINVAL; - fsopt->rasize = ALIGN(intval, PAGE_SIZE); + fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: - if (intval < 1) - return -EINVAL; - fsopt->caps_wanted_delay_min = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_min = result.uint_32; break; case Opt_caps_wanted_delay_max: - if (intval < 1) - return -EINVAL; - fsopt->caps_wanted_delay_max = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_max = result.uint_32; break; case Opt_caps_max: - if (intval < 0) - return -EINVAL; - fsopt->caps_max = intval; + fsopt->caps_max = result.uint_32; break; case Opt_readdir_max_entries: - if (intval < 1) - return -EINVAL; - fsopt->max_readdir = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->max_readdir = result.uint_32; break; case Opt_readdir_max_bytes: - if (intval < (int)PAGE_SIZE && intval != 0) - return -EINVAL; - fsopt->max_readdir_bytes = intval; + if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0) + goto out_of_range; + fsopt->max_readdir_bytes = result.uint_32; break; case Opt_congestion_kb: - if (intval < 1024) /* at least 1M */ - return -EINVAL; - fsopt->congestion_kb = intval; + if (result.uint_32 < 1024) /* at least 1M */ + goto out_of_range; + fsopt->congestion_kb = result.uint_32; break; case Opt_dirstat: - fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; - break; - case Opt_nodirstat: - fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; break; case Opt_rbytes: - fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; - break; - case Opt_norbytes: - fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; case Opt_asyncreaddir: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; - break; - case Opt_noasyncreaddir: - fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; case Opt_dcache: - fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; - break; - case Opt_nodcache: - fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; break; case Opt_ino32: - fsopt->flags |= CEPH_MOUNT_OPT_INO32; - break; - case Opt_noino32: - fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; break; + case Opt_fscache: - 
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; - kfree(fsopt->fscache_uniq); - fsopt->fscache_uniq = NULL; - break; - case Opt_nofscache: - fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; +#ifdef CONFIG_CEPH_FSCACHE kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = NULL; + if (result.negated) { + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + } else { + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + fsopt->fscache_uniq = param->string; + param->string = NULL; + } break; +#else + return invalf(fc, "ceph: fscache support is disabled"); +#endif case Opt_poolperm: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; - break; - case Opt_nopoolperm: - fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; break; case Opt_require_active_mds: - fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; - break; - case Opt_norequire_active_mds: - fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; + else + fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; break; case Opt_quotadf: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; - break; - case Opt_noquotadf: - fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; break; case Opt_copyfrom: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; break; - case Opt_nocopyfrom: - fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; - break; -#ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: - fsopt->sb_flags |= SB_POSIXACL; - break; + if (!result.negated) { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + return invalf(fc, "ceph: POSIX ACL support is disabled"); #endif - case Opt_noacl: - fsopt->sb_flags &= ~SB_POSIXACL; + } else { + fc->sb_flags &= ~SB_POSIXACL; + } break; default: - BUG_ON(token); + BUG(); } return 0; + +out_of_range: + return invalf(fc, "ceph: %s out of range", param->key); } static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); + if (!args) + return; + kfree(args->snapdir_name); kfree(args->mds_namespace); kfree(args->server_path); @@ -450,91 +488,6 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, return ceph_compare_options(new_opt, fsc->client); } -static int parse_mount_options(struct ceph_mount_options **pfsopt, - struct ceph_options **popt, - int flags, char *options, - const char *dev_name) -{ - struct ceph_mount_options *fsopt; - const char *dev_name_end; - int err; - - if (!dev_name || !*dev_name) - return -EINVAL; - - fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); - if (!fsopt) - return -ENOMEM; - - dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - - fsopt->wsize = CEPH_MAX_WRITE_SIZE; - fsopt->rsize = CEPH_MAX_READ_SIZE; - fsopt->rasize = CEPH_RASIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - if (!fsopt->snapdir_name) { - err = -ENOMEM; - goto out; - } - - fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* - * Distinguish the server list from 
the path in "dev_name". - * Internally we do not include the leading '/' in the path. - * - * "dev_name" will look like: - * <server_spec>[,<server_spec>...]:[<path>] - * where - * <server_spec> is <ip>[:<port>] - * <path> is optional, but if present must begin with '/' - */ - dev_name_end = strchr(dev_name, '/'); - if (dev_name_end) { - if (strlen(dev_name_end) > 1) { - fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); - if (!fsopt->server_path) { - err = -ENOMEM; - goto out; - } - } - } else { - dev_name_end = dev_name + strlen(dev_name); - } - err = -EINVAL; - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') { - pr_err("device name is missing path (no : separator in %s)\n", - dev_name); - goto out; - } - dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - if (fsopt->server_path) - dout("server path '%s'\n", fsopt->server_path); - - *popt = ceph_parse_options(options, dev_name, dev_name_end, - parse_fsopt_token, (void *)fsopt); - if (IS_ERR(*popt)) { - err = PTR_ERR(*popt); - goto out; - } - - /* success */ - *pfsopt = fsopt; - return 0; - -out: - destroy_mount_options(fsopt); - return err; -} - /** * ceph_show_options - Show mount options in /proc/mounts * @m: seq_file to write to @@ -578,7 +531,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noquotadf"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - if (fsopt->sb_flags & SB_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); else seq_puts(m, ",noacl"); @@ -851,12 +804,6 @@ static void ceph_umount_begin(struct super_block *sb) fsc->filp_gen++; // invalidate open files } -static int ceph_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - return 0; -} - static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, @@ -865,7 +812,6 @@ static const struct super_operations ceph_super_ops = { .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, - .remount_fs = ceph_remount, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin, @@ -926,7 +872,8 @@ out: /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, + struct fs_context *fc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -943,7 +890,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) /* setup fscache */ if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) { - err = ceph_fscache_register_fs(fsc); + err = ceph_fscache_register_fs(fsc, fc); if (err < 0) goto out; } @@ -978,18 +925,16 @@ out: return ERR_PTR(err); } -static int ceph_set_super(struct super_block *s, void *data) +static int ceph_set_super(struct super_block *s, struct fs_context *fc) { - struct ceph_fs_client *fsc = data; + struct ceph_fs_client *fsc = s->s_fs_info; int ret; - dout("set_super %p data %p\n", s, data); + dout("set_super %p\n", s); - s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_xattr = ceph_xattr_handlers; - s->s_fs_info = fsc; fsc->sb = s; fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ @@ -1001,24 +946,18 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_time_min = 0; s->s_time_max = U32_MAX; - ret = set_anon_super(s, NULL); /* what is that second arg for? 
*/ + ret = set_anon_super_fc(s, fc); if (ret != 0) - goto fail; - - return ret; - -fail: - s->s_fs_info = NULL; - fsc->sb = NULL; + fsc->sb = NULL; return ret; } /* * share superblock if same fs AND options */ -static int ceph_compare_super(struct super_block *sb, void *data) +static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) { - struct ceph_fs_client *new = data; + struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; struct ceph_fs_client *other = ceph_sb_to_client(sb); @@ -1034,7 +973,7 @@ static int ceph_compare_super(struct super_block *sb, void *data) dout("fsid doesn't match\n"); return 0; } - if (fsopt->sb_flags != other->mount_options->sb_flags) { + if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) { dout("flags differ\n"); return 0; } @@ -1064,46 +1003,46 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) return 0; } -static struct dentry *ceph_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ceph_get_tree(struct fs_context *fc) { + struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; + int (*compare_super)(struct super_block *, struct fs_context *) = + ceph_compare_super; int err; - int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - struct ceph_mount_options *fsopt = NULL; - struct ceph_options *opt = NULL; - dout("ceph_mount\n"); + dout("ceph_get_tree\n"); + + if (!fc->source) + return invalf(fc, "ceph: No source"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - flags |= SB_POSIXACL; + fc->sb_flags |= SB_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); - if (err < 0) { - res = ERR_PTR(err); - goto out_final; - } /* create client (which we may/may not use) */ - fsc = create_fs_client(fsopt, opt); + fsc = create_fs_client(pctx->opts, pctx->copts); + pctx->opts = NULL; + pctx->copts = NULL; if (IS_ERR(fsc)) { - res = ERR_CAST(fsc); + err = PTR_ERR(fsc); goto out_final; } err = ceph_mdsc_init(fsc); - if (err < 0) { - res = ERR_PTR(err); + if (err < 0) goto out; - } if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); + + fc->s_fs_info = fsc; + sb = sget_fc(fc, compare_super, ceph_set_super); + fc->s_fs_info = NULL; if (IS_ERR(sb)) { - res = ERR_CAST(sb); + err = PTR_ERR(sb); goto out; } @@ -1114,18 +1053,19 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } else { dout("get_sb using new client %p\n", fsc); err = ceph_setup_bdi(sb, fsc); - if (err < 0) { - res = ERR_PTR(err); + if (err < 0) goto out_splat; - } } - res = ceph_real_mount(fsc); - if (IS_ERR(res)) + res = ceph_real_mount(fsc, fc); + if (IS_ERR(res)) { + err = PTR_ERR(res); goto out_splat; + } dout("root %p inode %p ino %llx.%llx\n", res, d_inode(res), ceph_vinop(d_inode(res))); - return res; + fc->root = fsc->sb->s_root; + return 0; out_splat: ceph_mdsc_close_sessions(fsc->mdsc); @@ -1135,8 +1075,79 @@ out_splat: out: destroy_fs_client(fsc); out_final: - dout("ceph_mount fail %ld\n", PTR_ERR(res)); - return res; + dout("ceph_get_tree fail %d\n", err); + return err; +} + +static void ceph_free_fc(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + + if (pctx) { + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + } +} + +static int 
ceph_reconfigure_fc(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + return 0; +} + +static const struct fs_context_operations ceph_context_ops = { + .free = ceph_free_fc, + .parse_param = ceph_parse_mount_param, + .get_tree = ceph_get_tree, + .reconfigure = ceph_reconfigure_fc, +}; + +/* + * Set up the filesystem mount context. + */ +static int ceph_init_fs_context(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx; + struct ceph_mount_options *fsopt; + + pctx = kzalloc(sizeof(*pctx), GFP_KERNEL); + if (!pctx) + return -ENOMEM; + + pctx->copts = ceph_alloc_options(); + if (!pctx->copts) + goto nomem; + + pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL); + if (!pctx->opts) + goto nomem; + + fsopt = pctx->opts; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) + goto nomem; + + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + fc->fs_private = pctx; + fc->ops = &ceph_context_ops; + return 0; + +nomem: + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + return -ENOMEM; } static void ceph_kill_sb(struct super_block *s) @@ -1163,7 +1174,7 @@ static void ceph_kill_sb(struct super_block *s) static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", - .mount = ceph_mount, + .init_fs_context = ceph_init_fs_context, .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f98d9247f9cb..f0f9cb7447ac 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -74,7 +74,6 @@ struct ceph_mount_options { int flags; - int sb_flags; int wsize; /* max write size */ int rsize; /* max read size */ @@ -407,22 +406,26 @@ struct ceph_inode_info { struct inode vfs_inode; /* at end */ }; -static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +static inline struct ceph_inode_info * +ceph_inode(const struct inode *inode) { return container_of(inode, struct ceph_inode_info, vfs_inode); } -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client * +ceph_inode_to_client(const struct inode *inode) { return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client * +ceph_sb_to_client(const struct super_block *sb) { return (struct ceph_fs_client *)sb->s_fs_info; } -static inline struct ceph_vino ceph_vino(struct inode *inode) +static inline struct ceph_vino +ceph_vino(const struct inode *inode) { return ceph_inode(inode)->i_vino; } diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 0b4eee3bed66..19f6e592b941 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -122,6 +122,27 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) } static void +cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) +{ + struct TCP_Server_Info *server = chan->server; + + seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x " + "TCP status: %d Instance: %d Local Users To Server: %d " + "SecMode: 
0x%x Req On Wire: %d In Send: %d " + "In MaxReq Wait: %d\n", + i+1, + server->credits, + server->dialect, + server->tcpStatus, + server->reconnect_instance, + server->srv_count, + server->sec_mode, + in_flight(server), + atomic_read(&server->in_send), + atomic_read(&server->num_waiters)); +} + +static void cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; @@ -256,6 +277,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) if (!server->rdma) goto skip_rdma; + if (!server->smbd_conn) { + seq_printf(m, "\nSMBDirect transport not available"); + goto skip_rdma; + } + seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " "transport status: %x", server->smbd_conn->protocol, @@ -360,11 +386,10 @@ skip_rdma: server->srv_count, server->sec_mode, in_flight(server)); -#ifdef CONFIG_CIFS_STATS2 seq_printf(m, " In Send: %d In MaxReq Wait: %d", atomic_read(&server->in_send), atomic_read(&server->num_waiters)); -#endif + /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) @@ -372,6 +397,13 @@ skip_rdma: if (ses->sign) seq_puts(m, " signed"); + if (ses->chan_count > 1) { + seq_printf(m, "\n\n\tExtra Channels: %zu\n", + ses->chan_count-1); + for (j = 1; j < ses->chan_count; j++) + cifs_dump_channel(m, j, &ses->chans[j]); + } + seq_puts(m, "\n\tShares:"); j = 0; @@ -410,8 +442,13 @@ skip_rdma: seq_printf(m, "\n\tServer interfaces: %zu\n", ses->iface_count); for (j = 0; j < ses->iface_count; j++) { + struct cifs_server_iface *iface; + + iface = &ses->iface_list[j]; seq_printf(m, "\t%d)", j); - cifs_dump_iface(m, &ses->iface_list[j]); + cifs_dump_iface(m, iface); + if (is_ses_using_iface(ses, iface)) + seq_puts(m, "\t\t[CONNECTED]\n"); } spin_unlock(&ses->iface_lock); } diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 7f01c6e60791..7b9b876b513b 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -98,7 +98,7 @@ struct key_type cifs_spnego_key_type = { struct key * cifs_get_spnego_key(struct cifs_ses *sesInfo) { - struct TCP_Server_Info *server = sesInfo->server; + struct TCP_Server_Info *server = cifs_ses_server(sesInfo); struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr; struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr; char *description, *dp; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index f842944a5c76..06ffe52bdcfa 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -39,8 +39,6 @@ static const struct cifs_sid sid_everyone = { /* security id for Authenticated Users system group */ static const struct cifs_sid sid_authusers = { 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; -/* group users */ -static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; /* S-1-22-1 Unmapped Unix users */ static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c049c7b3aa87..5492b9860baa 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; +struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; @@ -169,7 +170,13 @@ cifs_read_super(struct super_block *sb) else sb->s_maxbytes = MAX_NON_LFS; - /* Some very old servers like DOS and OS/2 used 2 second granularity */ + /* + * 
Some very old servers like DOS and OS/2 used 2 second granularity + * (while all current servers use 100ns granularity - see MS-DTYP) + * but 1 second is the maximum allowed granularity for the VFS + * so for old servers set time granularity to 1 second while for + * everything else (current servers) set it to 100ns. + */ if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) && ((tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find) == 0) && @@ -607,6 +614,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); + if (tcon->ses->chan_max > 1) + seq_printf(s, ",multichannel,max_channel=%zu", + tcon->ses->chan_max); + return 0; } @@ -719,11 +730,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) struct inode *dir = d_inode(dentry); struct dentry *child; - if (!dir) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); - break; - } if (!S_ISDIR(dir->i_mode)) { dput(dentry); dentry = ERR_PTR(-ENOTDIR); @@ -740,7 +746,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - child = lookup_one_len_unlocked(p, dentry, s - p); + child = lookup_positive_unlocked(p, dentry, s - p); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -1213,6 +1219,7 @@ const struct file_operations cifs_file_ops = { .open = cifs_open, .release = cifs_close, .lock = cifs_lock, + .flock = cifs_flock, .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, @@ -1232,6 +1239,7 @@ const struct file_operations cifs_file_strict_ops = { .open = cifs_open, .release = cifs_close, .lock = cifs_lock, + .flock = cifs_flock, .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, @@ -1251,6 +1259,7 @@ const struct file_operations cifs_file_direct_ops = { .open = cifs_open, .release = cifs_close, .lock = cifs_lock, + .flock = cifs_flock, .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, @@ -1537,7 +1546,7 @@ init_cifs(void) /* * Consider in future setting limit!=0 maybe to min(num_of_cores - 1, 3) * so that we don't launch too many worker threads but - * Documentation/workqueue.txt recommends setting it to 0 + * Documentation/core-api/workqueue.rst recommends setting it to 0 */ /* WQ_UNBOUND allows decrypt tasks to run on any CPU */ @@ -1548,11 +1557,18 @@ init_cifs(void) goto out_destroy_cifsiod_wq; } + fileinfo_put_wq = alloc_workqueue("cifsfileinfoput", + WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!fileinfo_put_wq) { + rc = -ENOMEM; + goto out_destroy_decrypt_wq; + } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!cifsoplockd_wq) { rc = -ENOMEM; - goto out_destroy_decrypt_wq; + goto out_destroy_fileinfo_put_wq; } rc = cifs_fscache_register(); @@ -1618,6 +1634,8 @@ out_unreg_fscache: cifs_fscache_unregister(); out_destroy_cifsoplockd_wq: destroy_workqueue(cifsoplockd_wq); +out_destroy_fileinfo_put_wq: + destroy_workqueue(fileinfo_put_wq); out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: @@ -1647,6 +1665,7 @@ exit_cifs(void) cifs_fscache_unregister(); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); + destroy_workqueue(fileinfo_put_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } @@ -1657,17 +1676,17 @@ MODULE_DESCRIPTION ("VFS to access SMB3 servers e.g. 
Samba, Macs, Azure and Windows (and " "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); +MODULE_SOFTDEP("pre: ecb"); +MODULE_SOFTDEP("pre: hmac"); +MODULE_SOFTDEP("pre: md4"); +MODULE_SOFTDEP("pre: md5"); +MODULE_SOFTDEP("pre: nls"); +MODULE_SOFTDEP("pre: aes"); +MODULE_SOFTDEP("pre: cmac"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: sha512"); +MODULE_SOFTDEP("pre: aead2"); +MODULE_SOFTDEP("pre: ccm"); +MODULE_SOFTDEP("pre: gcm"); module_init(init_cifs) module_exit(exit_cifs) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index bc4ca94137f2..b59dc7478130 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -108,6 +108,7 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); +extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); @@ -152,5 +153,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.23" +#define CIFS_VERSION "2.24" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 50dfd9049370..d34a4ed8c57d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -230,7 +230,8 @@ struct smb_version_operations { bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *); /* setup request: allocate mid, sign message */ struct mid_q_entry *(*setup_request)(struct cifs_ses *, - struct smb_rqst *); + struct TCP_Server_Info *, + struct smb_rqst *); /* setup async request: allocate mid, sign message */ struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *, struct smb_rqst *); @@ -268,8 +269,9 @@ struct smb_version_operations { int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *); - void (*downgrade_oplock)(struct TCP_Server_Info *, - struct cifsInodeInfo *, bool); + void (*downgrade_oplock)(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -591,6 +593,10 @@ struct smb_vol { bool resilient:1; /* noresilient not required since not forced for CA */ bool domainauto:1; bool rdma:1; + bool multichannel:1; + bool use_client_guid:1; + /* reuse existing guid for multichannel */ + u8 client_guid[SMB2_CLIENT_GUID_SIZE]; unsigned int bsize; unsigned int rsize; unsigned int wsize; @@ -607,6 +613,7 @@ struct smb_vol { __u64 snapshot_time; /* needed for timewarp tokens */ __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 
max_credits 10 < credits < 60000 */ + unsigned int max_channels; __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */ bool rootfs:1; /* if it's a SMB root file system */ }; @@ -736,12 +743,12 @@ struct TCP_Server_Info { /* Total size of this PDU. Only valid from cifs_demultiplex_thread */ unsigned int pdu_size; unsigned int total_read; /* total amount of data read in this pass */ + atomic_t in_send; /* requests trying to send */ + atomic_t num_waiters; /* blocked waiting to get in sendrecv */ #ifdef CONFIG_CIFS_FSCACHE struct fscache_cookie *fscache; /* client index cache cookie */ #endif #ifdef CONFIG_CIFS_STATS2 - atomic_t in_send; /* requests trying to send */ - atomic_t num_waiters; /* blocked waiting to get in sendrecv */ atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */ atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */ __u64 time_per_cmd[NUMBER_OF_SMB2_COMMANDS]; /* total time per cmd */ @@ -953,6 +960,11 @@ struct cifs_server_iface { struct sockaddr_storage sockaddr; }; +struct cifs_chan { + struct TCP_Server_Info *server; + __u8 signkey[SMB3_SIGN_KEY_SIZE]; +}; + /* * Session structure. One of these for each uid session with a particular host */ @@ -983,12 +995,15 @@ struct cifs_ses { bool sign; /* is signing required? */ bool need_reconnect:1; /* connection reset, uid now invalid */ bool domainAuto:1; + bool binding:1; /* are we binding the session? */ __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; + __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; + /* * Network interfaces available on the server this session is * connected to. 
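The struct cifs_chan added above pairs one TCP connection with a per-channel signing key, and the hunk just below grows struct cifs_ses with a fixed array of such channels plus a chan_seq round-robin counter. As an illustration of how a sender can rotate requests across the established channels (sketch only; the helper name is invented and the real selection logic lives in transport.c, outside this excerpt, using the cifs_ses_server() helper added below):

static struct TCP_Server_Info *
pick_channel_sketch(struct cifs_ses *ses)
{
	unsigned int index = 0;

	/* while a new channel is being bound, send on that channel */
	if (ses->binding)
		return cifs_ses_server(ses);

	if (ses->chan_count > 1) {
		/* one atomic increment per send: cheap round robin */
		index = (unsigned int)atomic_inc_return(&ses->chan_seq);
		index %= ses->chan_count;
	}
	return ses->chans[index].server;
}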
@@ -1002,8 +1017,37 @@ struct cifs_ses { struct cifs_server_iface *iface_list; size_t iface_count; unsigned long iface_last_update; /* jiffies */ + +#define CIFS_MAX_CHANNELS 16 + struct cifs_chan chans[CIFS_MAX_CHANNELS]; + size_t chan_count; + size_t chan_max; + atomic_t chan_seq; /* round robin state */ }; +/* + * When binding a new channel, we need to access the channel which isn't fully + * established yet (one past the established count) + */ + +static inline +struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses) +{ + if (ses->binding) + return &ses->chans[ses->chan_count]; + else + return NULL; +} + +static inline +struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses) +{ + if (ses->binding) + return ses->chans[ses->chan_count].server; + else + return ses->server; +} + static inline bool cap_unix(struct cifs_ses *ses) { @@ -1260,11 +1304,14 @@ struct cifsFileInfo { unsigned int f_flags; bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; + unsigned int oplock_epoch; /* epoch from the lease break */ + __u32 oplock_level; /* oplock/lease level from the lease break */ int count; spinlock_t file_info_lock; /* protects four flag/count fields above */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ + struct work_struct put; /* work for the final part of _put */ }; struct cifs_io_parms { @@ -1370,7 +1417,8 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr, + bool offload); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1391,6 +1439,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); struct cifsInodeInfo { bool can_cache_brlcks; struct list_head llist; /* locks held by this inode */ + /* + * NOTE: Some code paths call down_read(lock_sem) twice, so + * we must always use cifs_down_write() instead of down_write() + * for this semaphore to avoid deadlocks. + */ struct rw_semaphore lock_sem; /* protect the fields above */ /* BB add in lists for dirty pages i.e. 
write caching info for oplock */ struct list_head openFileList; @@ -1400,7 +1453,7 @@ struct cifsInodeInfo { unsigned int epoch; /* used to track lease state changes */ #define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ #define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ -#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */ +#define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */ #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ @@ -1519,6 +1572,7 @@ struct mid_q_entry { struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ __u16 credits; /* number of credits consumed by this mid */ + __u16 credits_received; /* number of credits from the response */ __u32 pid; /* process id */ __u32 sequence_number; /* for CIFS signing */ unsigned long when_alloc; /* when mid was created */ @@ -1546,12 +1600,12 @@ struct close_cancelled_open { struct cifs_fid fid; struct cifs_tcon *tcon; struct work_struct work; + __u64 mid; + __u16 cmd; }; /* Make code in transport.c a little cleaner by moving update of optional stats into function below */ -#ifdef CONFIG_CIFS_STATS2 - static inline void cifs_in_send_inc(struct TCP_Server_Info *server) { atomic_inc(&server->in_send); @@ -1572,26 +1626,12 @@ static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) atomic_dec(&server->num_waiters); } +#ifdef CONFIG_CIFS_STATS2 static inline void cifs_save_when_sent(struct mid_q_entry *mid) { mid->when_sent = jiffies; } #else -static inline void cifs_in_send_inc(struct TCP_Server_Info *server) -{ -} -static inline void cifs_in_send_dec(struct TCP_Server_Info *server) -{ -} - -static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server) -{ -} - -static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server) -{ -} - static inline void cifs_save_when_sent(struct mid_q_entry *mid) { } @@ -1902,6 +1942,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; +extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; @@ -1932,4 +1973,10 @@ extern struct smb_version_values smb302_values; #define ALT_SMB311_VERSION_STRING "3.11" extern struct smb_version_operations smb311_operations; extern struct smb_version_values smb311_values; + +static inline bool is_smb1_server(struct TCP_Server_Info *server) +{ + return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e53e9f62b87b..1ed695336f62 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -109,6 +109,7 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *, extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses, char *in_buf, int flags); extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *, + struct TCP_Server_Info *, struct smb_rqst *); extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *, struct smb_rqst *); @@ -170,6 +171,7 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); +extern 
void cifs_down_write(struct rw_semaphore *sem); extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, @@ -241,6 +243,7 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); +extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb_vol *vol); extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); extern void cifs_put_tcon(struct cifs_tcon *tcon); @@ -583,6 +586,12 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); +int cifs_try_adding_channels(struct cifs_ses *ses); +int cifs_ses_add_channel(struct cifs_ses *ses, + struct cifs_server_iface *iface); +bool is_server_using_iface(struct TCP_Server_Info *server, + struct cifs_server_iface *iface); +bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a64dfa95a925..86d1baedf21c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -97,6 +97,7 @@ enum { Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, Opt_domainauto, Opt_rdma, Opt_modesid, Opt_rootfs, + Opt_multichannel, Opt_nomultichannel, Opt_compress, /* Mount options which take numeric value */ @@ -106,7 +107,7 @@ enum { Opt_min_enc_offload, Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo, Opt_echo_interval, Opt_max_credits, Opt_handletimeout, - Opt_snapshot, + Opt_snapshot, Opt_max_channels, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -199,6 +200,8 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_noresilient, "noresilienthandles"}, { Opt_domainauto, "domainauto"}, { Opt_rdma, "rdma"}, + { Opt_multichannel, "multichannel" }, + { Opt_nomultichannel, "nomultichannel" }, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -218,6 +221,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, { Opt_snapshot, "snapshot=%s" }, + { Opt_max_channels, "max_channels=%s" }, { Opt_compress, "compress=%s" }, { Opt_blank_user, "user=" }, @@ -387,7 +391,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server) #ifdef CONFIG_CIFS_DFS_UPCALL struct super_cb_data { struct TCP_Server_Info *server; - struct cifs_sb_info *cifs_sb; + struct super_block *sb; }; /* These functions must be called with server->srv_mutex held */ @@ -398,25 +402,39 @@ static void super_cb(struct super_block *sb, void *arg) struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - if (d->cifs_sb) + if (d->sb) return; cifs_sb = CIFS_SB(sb); tcon = cifs_sb_master_tcon(cifs_sb); if (tcon->ses->server == d->server) - d->cifs_sb = cifs_sb; + d->sb = sb; } -static inline struct cifs_sb_info * -find_super_by_tcp(struct TCP_Server_Info *server) +static struct super_block *get_tcp_super(struct TCP_Server_Info *server) { struct super_cb_data d = { .server = server, - .cifs_sb = NULL, + .sb = NULL, }; iterate_supers_type(&cifs_fs_type, super_cb, &d); - return d.cifs_sb ? 
d.cifs_sb : ERR_PTR(-ENOENT); + + if (unlikely(!d.sb)) + return ERR_PTR(-ENOENT); + /* + * Grab an active reference in order to prevent automounts (DFS links) + * of expiring and then freeing up our cifs superblock pointer while + * we're doing failover. + */ + cifs_sb_active(d.sb); + return d.sb; +} + +static inline void put_tcp_super(struct super_block *sb) +{ + if (!IS_ERR_OR_NULL(sb)) + cifs_sb_deactive(sb); } static void reconn_inval_dfs_target(struct TCP_Server_Info *server, @@ -480,6 +498,7 @@ cifs_reconnect(struct TCP_Server_Info *server) struct mid_q_entry *mid_entry; struct list_head retry_list; #ifdef CONFIG_CIFS_DFS_UPCALL + struct super_block *sb = NULL; struct cifs_sb_info *cifs_sb = NULL; struct dfs_cache_tgt_list tgt_list = {0}; struct dfs_cache_tgt_iterator *tgt_it = NULL; @@ -489,13 +508,15 @@ cifs_reconnect(struct TCP_Server_Info *server) server->nr_targets = 1; #ifdef CONFIG_CIFS_DFS_UPCALL spin_unlock(&GlobalMid_Lock); - cifs_sb = find_super_by_tcp(server); - if (IS_ERR(cifs_sb)) { - rc = PTR_ERR(cifs_sb); + sb = get_tcp_super(server); + if (IS_ERR(sb)) { + rc = PTR_ERR(sb); cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n", __func__, rc); - cifs_sb = NULL; + sb = NULL; } else { + cifs_sb = CIFS_SB(sb); + rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it); if (rc && (rc != -EOPNOTSUPP)) { cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n", @@ -512,6 +533,10 @@ cifs_reconnect(struct TCP_Server_Info *server) /* the demux thread will exit normally next time through the loop */ spin_unlock(&GlobalMid_Lock); +#ifdef CONFIG_CIFS_DFS_UPCALL + dfs_cache_free_tgts(&tgt_list); + put_tcp_super(sb); +#endif return rc; } else server->tcpStatus = CifsNeedReconnect; @@ -564,9 +589,11 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + kref_get(&mid_entry->refcount); if (mid_entry->mid_state == MID_REQUEST_SUBMITTED) mid_entry->mid_state = MID_RETRY_NEEDED; list_move(&mid_entry->qhead, &retry_list); + mid_entry->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); mutex_unlock(&server->srv_mutex); @@ -576,6 +603,7 @@ cifs_reconnect(struct TCP_Server_Info *server) mid_entry = list_entry(tmp, struct mid_q_entry, qhead); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); + cifs_mid_q_entry_release(mid_entry); } if (cifs_rdma_enabled(server)) { @@ -635,7 +663,10 @@ cifs_reconnect(struct TCP_Server_Info *server) __func__, rc); } dfs_cache_free_tgts(&tgt_list); + } + + put_tcp_super(sb); #endif if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); @@ -895,11 +926,27 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) if (mid->mid_flags & MID_DELETED) printk_once(KERN_WARNING "trying to dequeue a deleted mid\n"); - else + else { list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } spin_unlock(&GlobalMid_Lock); } +static unsigned int +smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + + /* + * SMB1 does not use credits. 
+ */ + if (server->vals->header_preamble_size) + return 0; + + return le16_to_cpu(shdr->CreditRequest); +} + static void handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, char *buf, int malformed) @@ -907,6 +954,7 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server, if (server->ops->check_trans2 && server->ops->check_trans2(mid, server, buf, malformed)) return; + mid->credits_received = smb2_get_credits_from_hdr(buf, server); mid->resp_buf = buf; mid->large_buf = server->large_buf; /* Was previous buf put in mpx struct for multi-rsp? */ @@ -966,8 +1014,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid); + kref_get(&mid_entry->refcount); mid_entry->mid_state = MID_SHUTDOWN; list_move(&mid_entry->qhead, &dispose_list); + mid_entry->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); @@ -977,6 +1027,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); + cifs_mid_q_entry_release(mid_entry); } /* 1/8th of sec is more than enough time for them to exit */ msleep(125); @@ -1214,12 +1265,6 @@ next_pdu: for (i = 0; i < num_mids; i++) { if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; - if ((mids[i]->mid_flags & MID_WAIT_CANCELLED) && - mids[i]->mid_state == MID_RESPONSE_RECEIVED && - server->ops->handle_cancelled_mid) - server->ops->handle_cancelled_mid( - mids[i]->resp_buf, - server); if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); @@ -1664,6 +1709,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT; + /* default to no multichannel (single server connection) */ + vol->multichannel = false; + vol->max_channels = 1; + if (!mountdata) goto cifs_parse_mount_err; @@ -1957,6 +2006,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_rdma: vol->rdma = true; break; + case Opt_multichannel: + vol->multichannel = true; + break; + case Opt_nomultichannel: + vol->multichannel = false; + break; case Opt_compress: vol->compression = UNKNOWN_TYPE; cifs_dbg(VFS, @@ -2120,6 +2175,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->max_credits = option; break; + case Opt_max_channels: + if (get_option_ul(args, &option) || option < 1 || + option > CIFS_MAX_CHANNELS) { + cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n", + __func__, CIFS_MAX_CHANNELS); + goto cifs_parse_mount_err; + } + vol->max_channels = option; + break; /* String Arguments */ @@ -2705,7 +2769,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) send_sig(SIGKILL, task, 1); } -static struct TCP_Server_Info * +struct TCP_Server_Info * cifs_get_tcp_session(struct smb_vol *volume_info) { struct TCP_Server_Info *tcp_ses = NULL; @@ -2764,7 +2828,11 @@ cifs_get_tcp_session(struct smb_vol *volume_info) sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); - generate_random_uuid(tcp_ses->client_guid); + if (volume_info->use_client_guid) + memcpy(tcp_ses->client_guid, volume_info->client_guid, + SMB2_CLIENT_GUID_SIZE); + else + generate_random_uuid(tcp_ses->client_guid); /* * at this point we are the only ones with the pointer * to the 
struct since the kernel thread not created yet @@ -2853,6 +2921,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) vol->sectype != ses->sectype) return 0; + /* + * If an existing session is limited to less channels than + * requested, it should not be reused + */ + if (ses->chan_max < vol->max_channels) + return 0; + switch (ses->sectype) { case Kerberos: if (!uid_eq(vol->cred_uid, ses->cred_uid)) @@ -3023,6 +3098,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); + /* close any extra channels */ + if (ses->chan_count > 1) { + int i; + + for (i = 1; i < ses->chan_count; i++) + cifs_put_tcp_session(ses->chans[i].server, 0); + } + sesInfoFree(ses); cifs_put_tcp_session(server, 0); } @@ -3269,14 +3352,25 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->sectype = volume_info->sectype; ses->sign = volume_info->sign; mutex_lock(&ses->session_mutex); + + /* add server as first channel */ + ses->chans[0].server = server; + ses->chan_count = 1; + ses->chan_max = volume_info->multichannel ? volume_info->max_channels:1; + rc = cifs_negotiate_protocol(xid, ses); if (!rc) rc = cifs_setup_session(xid, ses, volume_info->local_nls); + + /* each channel uses a different signing key */ + memcpy(ses->chans[0].signkey, ses->smb3signingkey, + sizeof(ses->smb3signingkey)); + mutex_unlock(&ses->session_mutex); if (rc) goto get_ses_fail; - /* success, put it on the list */ + /* success, put it on the list and add it as first channel */ spin_lock(&cifs_tcp_ses_lock); list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3882,8 +3976,12 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = socket->ops->connect(socket, saddr, slen, server->noblockcnt ? O_NONBLOCK : 0); - - if (rc == -EINPROGRESS) + /* + * When mounting SMB root file systems, we do not want to block in + * connect. Otherwise bail out and then let cifs_reconnect() perform + * reconnect failover - if possible. + */ + if (server->noblockcnt && rc == -EINPROGRESS) rc = 0; if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); @@ -4688,6 +4786,17 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol, } #ifdef CONFIG_CIFS_DFS_UPCALL +static inline void set_root_tcon(struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, + struct cifs_tcon **root) +{ + spin_lock(&cifs_tcp_ses_lock); + tcon->tc_count++; + tcon->remap = cifs_remap(cifs_sb); + spin_unlock(&cifs_tcp_ses_lock); + *root = tcon; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) { int rc = 0; @@ -4789,18 +4898,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) /* Cache out resolved root server */ (void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), root_path + 1, NULL, NULL); - /* - * Save root tcon for additional DFS requests to update or create a new - * DFS cache entry, or even perform DFS failover. 
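Taken together with the Opt_multichannel/Opt_max_channels parsing added earlier in this file, the new smb_vol fields are driven entirely from mount options. A mount requesting up to four channels might look like this (hypothetical server and share names; an SMB 3.0+ dialect is required, as the channel-opening code in sess.c enforces):

	mount -t cifs //server/share /mnt -o username=testuser,vers=3.0,multichannel,max_channels=4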
- */ - spin_lock(&cifs_tcp_ses_lock); - tcon->tc_count++; - tcon->dfs_path = root_path; + kfree(root_path); root_path = NULL; - tcon->remap = cifs_remap(cifs_sb); - spin_unlock(&cifs_tcp_ses_lock); - root_tcon = tcon; + set_root_tcon(cifs_sb, tcon, &root_tcon); for (count = 1; ;) { if (!rc && tcon) { @@ -4837,6 +4938,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) mount_put_conns(cifs_sb, xid, server, ses, tcon); rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon); + /* + * Ensure that DFS referrals go through new root server. + */ + if (!rc && tcon && + (tcon->share_flags & (SHI1005_FLAGS_DFS | + SHI1005_FLAGS_DFS_ROOT))) { + cifs_put_tcon(root_tcon); + set_root_tcon(cifs_sb, tcon, &root_tcon); + } } if (rc) { if (rc == -EACCES || rc == -EOPNOTSUPP) @@ -4885,6 +4995,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) cifs_autodisable_serverino(cifs_sb); out: free_xid(xid); + cifs_try_adding_channels(ses); return mount_setup_tlink(cifs_sb, ses, tcon); error: @@ -5130,7 +5241,7 @@ int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses) { int rc = 0; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; @@ -5157,23 +5268,25 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct TCP_Server_Info *server = ses->server; - - ses->capabilities = server->capabilities; - if (linuxExtEnabled == 0) - ses->capabilities &= (~server->vals->cap_unix); + struct TCP_Server_Info *server = cifs_ses_server(ses); + + if (!ses->binding) { + ses->capabilities = server->capabilities; + if (linuxExtEnabled == 0) + ses->capabilities &= (~server->vals->cap_unix); + + if (ses->auth_key.response) { + cifs_dbg(FYI, "Free previous auth_key.response = %p\n", + ses->auth_key.response); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } + } cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); - if (ses->auth_key.response) { - cifs_dbg(FYI, "Free previous auth_key.response = %p\n", - ses->auth_key.response); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - ses->auth_key.len = 0; - } - if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 1692c0c6c23a..2faa05860a48 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1317,7 +1317,6 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi, int rc; struct dfs_info3_param ref = {0}; char *mdata = NULL, *devname = NULL; - bool is_smb3 = tcon->ses->server->vals->header_preamble_size == 0; struct TCP_Server_Info *server; struct cifs_ses *ses; struct smb_vol vol; @@ -1344,7 +1343,7 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi, goto out; } - rc = cifs_setup_volume_info(&vol, mdata, devname, is_smb3); + rc = cifs_setup_volume_info(&vol, mdata, devname, false); kfree(devname); if (rc) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 7ce689d31aa2..f3b79012ff29 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -244,10 +244,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, *oplock = REQ_OPLOCK; full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; - goto out; - } + if (!full_path) + 
return -ENOMEM; if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5ad15de2bb4f..f1fe9c44d298 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -281,6 +281,15 @@ cifs_has_mand_locks(struct cifsInodeInfo *cinode) return has_locks; } +void +cifs_down_write(struct rw_semaphore *sem) +{ + while (!down_write_trylock(sem)) + msleep(10); +} + +static void cifsFileInfo_put_work(struct work_struct *work); + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -306,7 +315,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_add(&fdlocks->llist, &cinode->llist); up_write(&cinode->lock_sem); @@ -318,6 +327,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->invalidHandle = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); + INIT_WORK(&cfile->put, cifsFileInfo_put_work); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -368,6 +378,41 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } +static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) +{ + struct inode *inode = d_inode(cifs_file->dentry); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct cifsLockInfo *li, *tmp; + struct super_block *sb = inode->i_sb; + + /* + * Delete any outstanding lock records. We'll lose them when the file + * is closed anyway. + */ + cifs_down_write(&cifsi->lock_sem); + list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { + list_del(&li->llist); + cifs_del_lock_waiters(li); + kfree(li); + } + list_del(&cifs_file->llist->llist); + kfree(cifs_file->llist); + up_write(&cifsi->lock_sem); + + cifs_put_tlink(cifs_file->tlink); + dput(cifs_file->dentry); + cifs_sb_deactive(sb); + kfree(cifs_file); +} + +static void cifsFileInfo_put_work(struct work_struct *work) +{ + struct cifsFileInfo *cifs_file = container_of(work, + struct cifsFileInfo, put); + + cifsFileInfo_put_final(cifs_file); +} + /** * cifsFileInfo_put - release a reference of file priv data * @@ -375,15 +420,15 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { - _cifsFileInfo_put(cifs_file, true); + _cifsFileInfo_put(cifs_file, true, true); } /** * _cifsFileInfo_put - release a reference of file priv data * * This may involve closing the filehandle @cifs_file out on the - * server. Must be called without holding tcon->open_file_lock and - * cifs_file->file_info_lock. + * server. Must be called without holding tcon->open_file_lock, + * cinode->open_file_lock and cifs_file->file_info_lock. * * If @wait_for_oplock_handler is true and we are releasing the last * reference, wait for any running oplock break handler of the file @@ -391,7 +436,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) * oplock break handler, you need to pass false. 
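Why the down_write_trylock()/msleep() loop in cifs_down_write() above rather than a plain down_write()? The NOTE added to struct cifsInodeInfo states the constraint; the interleaving it guards against looks like this (a sketch, assuming the kernel rw_semaphore's writer-fair queueing):

/*
 * reader A: down_read(lock_sem)   -> granted
 * writer B: down_write(lock_sem)  -> queued; new readers now block
 * reader A: down_read(lock_sem)   -> waits behind B, while B waits
 *                                    for A's first hold: deadlock
 *
 * down_write_trylock() never joins the waiter queue, so reader A's
 * second down_read() can still be granted while the writer polls.
 */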
* */ -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, + bool wait_oplock_handler, bool offload) { struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); @@ -399,16 +445,16 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) struct cifsInodeInfo *cifsi = CIFS_I(inode); struct super_block *sb = inode->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; bool oplock_break_cancelled; spin_lock(&tcon->open_file_lock); - + spin_lock(&cifsi->open_file_lock); spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&cifsi->open_file_lock); spin_unlock(&tcon->open_file_lock); return; } @@ -421,9 +467,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open); /* remove it from the lists */ - spin_lock(&cifsi->open_file_lock); list_del(&cifs_file->flist); - spin_unlock(&cifsi->open_file_lock); list_del(&cifs_file->tlist); atomic_dec(&tcon->num_local_opens); @@ -440,6 +484,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_set_oplock_level(cifsi, 0); } + spin_unlock(&cifsi->open_file_lock); spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = wait_oplock_handler ? @@ -460,24 +505,10 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_del_pending_open(&open); - /* - * Delete any outstanding lock records. We'll lose them when the file - * is closed anyway. - */ - down_write(&cifsi->lock_sem); - list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { - list_del(&li->llist); - cifs_del_lock_waiters(li); - kfree(li); - } - list_del(&cifs_file->llist->llist); - kfree(cifs_file->llist); - up_write(&cifsi->lock_sem); - - cifs_put_tlink(cifs_file->tlink); - dput(cifs_file->dentry); - cifs_sb_deactive(sb); - kfree(cifs_file); + if (offload) + queue_work(fileinfo_put_wq, &cifs_file->put); + else + cifsFileInfo_put_final(cifs_file); } int cifs_open(struct inode *inode, struct file *file) @@ -721,6 +752,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + /* O_SYNC also has bit for O_DSYNC so following check picks up either */ + if (cfile->f_flags & O_SYNC) + create_options |= CREATE_WRITE_THROUGH; + + if (cfile->f_flags & O_DIRECT) + create_options |= CREATE_NO_BUFFER; + if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &cfile->fid); @@ -801,7 +839,7 @@ reopen_error_exit: int cifs_close(struct inode *inode, struct file *file) { if (file->private_data != NULL) { - cifsFileInfo_put(file->private_data); + _cifsFileInfo_put(file->private_data, true, false); file->private_data = NULL; } @@ -1027,7 +1065,7 @@ static void cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) { struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); } @@ -1049,7 +1087,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, try_again: exist = false; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); exist = 
cifs_find_lock_conflict(cfile, lock->offset, lock->length, lock->type, lock->flags, &conf_lock, @@ -1072,7 +1110,7 @@ try_again: (lock->blist.next == &lock->blist)); if (!rc) goto try_again; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_del_init(&lock->blist); } @@ -1125,7 +1163,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock) return rc; try_again: - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { up_write(&cinode->lock_sem); return rc; @@ -1331,7 +1369,7 @@ cifs_push_locks(struct cifsFileInfo *cfile) int rc = 0; /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { up_write(&cinode->lock_sem); return rc; @@ -1522,7 +1560,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (!buf) return -ENOMEM; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); for (i = 0; i < 2; i++) { cur = buf; num = 0; @@ -1674,7 +1712,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX) { + if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) { /* * If this is a request to remove all locks because we * are closing the file, it doesn't matter if the @@ -1691,6 +1729,52 @@ out: return rc; } +int cifs_flock(struct file *file, int cmd, struct file_lock *fl) +{ + int rc, xid; + int lock = 0, unlock = 0; + bool wait_flag = false; + bool posix_lck = false; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *cfile; + __u32 type; + + rc = -EACCES; + xid = get_xid(); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + cfile = (struct cifsFileInfo *)file->private_data; + tcon = tlink_tcon(cfile->tlink); + + cifs_read_flock(fl, &type, &lock, &unlock, &wait_flag, + tcon->ses->server); + cifs_sb = CIFS_FILE_SB(file); + + if (cap_unix(tcon->ses) && + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && + ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) + posix_lck = true; + + if (!lock && !unlock) { + /* + * if no lock or unlock then nothing to do since we do not + * know what it is + */ + free_xid(xid); + return -EOPNOTSUPP; + } + + rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock, + xid); + free_xid(xid); + return rc; + + +} + int cifs_lock(struct file *file, int cmd, struct file_lock *flock) { int rc, xid; @@ -2750,9 +2834,17 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, if (!rc) { if (wdata->cfile->invalidHandle) rc = -EAGAIN; - else + else { +#ifdef CONFIG_CIFS_SMB_DIRECT + if (wdata->mr) { + wdata->mr->need_invalidate = true; + smbd_deregister_mr(wdata->mr); + wdata->mr = NULL; + } +#endif rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); + } } /* If the write was successfully sent, we are done */ @@ -3475,8 +3567,16 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, if (!rc) { if (rdata->cfile->invalidHandle) rc = -EAGAIN; - else + else { +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) { + rdata->mr->need_invalidate = true; + smbd_deregister_mr(rdata->mr); + rdata->mr = NULL; + } +#endif rc = server->ops->async_readv(rdata); + } } /* If the read was successfully sent, we are done */ @@ -4630,12 +4730,13 @@ void cifs_oplock_break(struct work_struct *work) struct cifs_tcon *tcon = 
tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; + bool purge_cache = false; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); - server->ops->downgrade_oplock(server, cinode, - test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); + server->ops->downgrade_oplock(server, cinode, cfile->oplock_level, + cfile->oplock_epoch, &purge_cache); if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { @@ -4650,18 +4751,21 @@ void cifs_oplock_break(struct work_struct *work) else break_lease(inode, O_WRONLY); rc = filemap_fdatawrite(inode->i_mapping); - if (!CIFS_CACHE_READ(cinode)) { + if (!CIFS_CACHE_READ(cinode) || purge_cache) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); cifs_zap_mapping(inode); } cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc); + if (CIFS_CACHE_WRITE(cinode)) + goto oplock_break_ack; } rc = cifs_push_locks(cfile); if (rc) cifs_dbg(VFS, "Push locks rc = %d\n", rc); +oplock_break_ack: /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do @@ -4673,7 +4777,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } - _cifsFileInfo_put(cfile, false /* do not wait for ourself */); + _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5dcc95b38310..8a76195e8a69 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -727,22 +727,138 @@ static __u64 simple_hashstr(const char *str) return hash; } +/** + * cifs_backup_query_path_info - SMB1 fallback code to get ino + * + * Fallback code to get file metadata when we don't have access to + * @full_path (EACCESS) and have backup creds. + * + * @data will be set to search info result buffer + * @resp_buf will be set to cifs resp buf and needs to be freed with + * cifs_buf_release() when done with @data. 
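With the rework of cifs_oplock_break() above, the handler no longer keys off a single downgrade-to-L2 flag bit: it passes the per-dialect downgrade_oplock op the raw level and epoch captured at break time and gets a purge_cache verdict back. A simplified sketch of an epoch-aware decision (the real per-dialect versions live in smb1ops.c/smb2ops.c, outside this excerpt; names mirror the new op signature but the body is illustrative):

static void downgrade_oplock_sketch(struct cifsInodeInfo *cinode,
				    __u32 lease_state, unsigned int epoch,
				    bool *purge_cache)
{
	*purge_cache = false;

	/* a break carrying an older epoch was already superseded */
	if (epoch && epoch <= cinode->epoch)
		return;
	cinode->epoch = epoch;

	/* once read caching is lost, cached pages may be stale */
	if (!(lease_state & SMB2_LEASE_READ_CACHING_HE))
		*purge_cache = true;
}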
+ */ +static int +cifs_backup_query_path_info(int xid, + struct cifs_tcon *tcon, + struct super_block *sb, + const char *full_path, + void **resp_buf, + FILE_ALL_INFO **data) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_search_info info = {0}; + u16 flags; + int rc; + + *resp_buf = NULL; + info.endOfSearch = false; + if (tcon->unix_ext) + info.info_level = SMB_FIND_FILE_UNIX; + else if ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) + info.info_level = SMB_FIND_FILE_INFO_STANDARD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + info.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + else /* no srvino useful for fallback to some netapp */ + info.info_level = SMB_FIND_FILE_DIRECTORY_INFO; + + flags = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; + + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, flags, &info, false); + if (rc) + return rc; + + *resp_buf = (void *)info.ntwrk_buf_start; + *data = (FILE_ALL_INFO *)info.srch_entries_start; + return 0; +} + +static void +cifs_set_fattr_ino(int xid, + struct cifs_tcon *tcon, + struct super_block *sb, + struct inode **inode, + const char *full_path, + FILE_ALL_INFO *data, + struct cifs_fattr *fattr) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct TCP_Server_Info *server = tcon->ses->server; + int rc; + + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) { + if (*inode) + fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid; + else + fattr->cf_uniqueid = iunique(sb, ROOT_I); + return; + } + + /* + * If we have an inode pass a NULL tcon to ensure we don't + * make a round trip to the server. This only works for SMB2+. + */ + rc = server->ops->get_srv_inum(xid, + *inode ? NULL : tcon, + cifs_sb, full_path, + &fattr->cf_uniqueid, + data); + if (rc) { + /* + * If that fails reuse existing ino or generate one + * and disable server ones + */ + if (*inode) + fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid; + else { + fattr->cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } + return; + } + + /* If no errors, check for zero root inode (invalid) */ + if (fattr->cf_uniqueid == 0 && strlen(full_path) == 0) { + cifs_dbg(FYI, "Invalid (0) inodenum\n"); + if (*inode) { + /* reuse */ + fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else { + /* make an ino by hashing the UNC */ + fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO; + fattr->cf_uniqueid = simple_hashstr(tcon->treeName); + } + } +} + +static inline bool is_inode_cache_good(struct inode *ino) +{ + return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; +} + int -cifs_get_inode_info(struct inode **inode, const char *full_path, - FILE_ALL_INFO *data, struct super_block *sb, int xid, +cifs_get_inode_info(struct inode **inode, + const char *full_path, + FILE_ALL_INFO *in_data, + struct super_block *sb, int xid, const struct cifs_fid *fid) { - __u16 srchflgs; - int rc = 0, tmprc = ENOSYS; + struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - char *buf = NULL; bool adjust_tz = false; - struct cifs_fattr fattr; - struct cifs_search_info *srchinf = NULL; + struct cifs_fattr fattr = {0}; bool symlink = false; + FILE_ALL_INFO *data = in_data; + FILE_ALL_INFO *tmp_data = NULL; + void *smb1_backup_rsp_buf = NULL; + int rc = 0; + int tmprc = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -750,142 +866,88 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, tcon = 
tlink_tcon(tlink); server = tcon->ses->server; - cifs_dbg(FYI, "Getting info on %s\n", full_path); + /* + * 1. Fetch file metadata if not provided (data) + */ - if ((data == NULL) && (*inode != NULL)) { - if (CIFS_CACHE_READ(CIFS_I(*inode)) && - CIFS_I(*inode)->time != 0) { + if (!data) { + if (is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); - goto cgii_exit; - } - } - - /* if inode info is not passed, get it from server */ - if (data == NULL) { - if (!server->ops->query_path_info) { - rc = -ENOSYS; - goto cgii_exit; + goto out; } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { + tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (!tmp_data) { rc = -ENOMEM; - goto cgii_exit; + goto out; } - data = (FILE_ALL_INFO *)buf; - rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, - data, &adjust_tz, &symlink); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, + full_path, tmp_data, + &adjust_tz, &symlink); + data = tmp_data; } - if (!rc) { - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, - symlink); - } else if (rc == -EREMOTE) { + /* + * 2. Convert it to internal cifs metadata (fattr) + */ + + switch (rc) { + case 0: + cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + break; + case -EREMOTE: + /* DFS link, no metadata available on this server */ cifs_create_dfs_fattr(&fattr, sb); rc = 0; - } else if ((rc == -EACCES) && backup_cred(cifs_sb) && - (strcmp(server->vals->version_string, SMB1_VERSION_STRING) - == 0)) { + break; + case -EACCES: /* - * For SMB2 and later the backup intent flag is already - * sent if needed on open and there is no path based - * FindFirst operation to use to retry with + * perm errors, try again with backup flags if possible + * + * For SMB2 and later the backup intent flag + * is already sent if needed on open and there + * is no path based FindFirst operation to use + * to retry with */ + if (backup_cred(cifs_sb) && is_smb1_server(server)) { + /* for easier reading */ + FILE_DIRECTORY_INFO *fdi; + SEARCH_ID_FULL_DIR_INFO *si; + + rc = cifs_backup_query_path_info(xid, tcon, sb, + full_path, + &smb1_backup_rsp_buf, + &data); + if (rc) + goto out; - srchinf = kzalloc(sizeof(struct cifs_search_info), - GFP_KERNEL); - if (srchinf == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + fdi = (FILE_DIRECTORY_INFO *)data; + si = (SEARCH_ID_FULL_DIR_INFO *)data; - srchinf->endOfSearch = false; - if (tcon->unix_ext) - srchinf->info_level = SMB_FIND_FILE_UNIX; - else if ((tcon->ses->capabilities & - tcon->ses->server->vals->cap_nt_find) == 0) - srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD; - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; - else /* no srvino useful for fallback to some netapp */ - srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO; - - srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | - CIFS_SEARCH_CLOSE_AT_END | - CIFS_SEARCH_BACKUP_SEARCH; - - rc = CIFSFindFirst(xid, tcon, full_path, - cifs_sb, NULL, srchflgs, srchinf, false); - if (!rc) { - data = (FILE_ALL_INFO *)srchinf->srch_entries_start; + cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); + /* uniqueid set, skip get inum step */ + goto handle_mnt_opt; + } else { + /* nothing we can do, bail out */ + goto out; + } + break; + default: + cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc); + goto out; + } - cifs_dir_info_to_fattr(&fattr, - (FILE_DIRECTORY_INFO *)data, cifs_sb); - 
fattr.cf_uniqueid = le64_to_cpu( - ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + /* + * 3. Get or update inode number (fattr.cf_uniqueid) + */ - cifs_buf_release(srchinf->ntwrk_buf_start); - } - kfree(srchinf); - if (rc) - goto cgii_exit; - } else - goto cgii_exit; + cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, &fattr); /* - * If an inode wasn't passed in, then get the inode number - * - * Is an i_ino of zero legal? Can we use that to check if the server - * supports returning inode numbers? Are there other sanity checks we - * can use to ensure that the server is really filling in that field? + * 4. Tweak fattr based on mount options */ - if (*inode == NULL) { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - if (server->ops->get_srv_inum) - tmprc = server->ops->get_srv_inum(xid, - tcon, cifs_sb, full_path, - &fattr.cf_uniqueid, data); - if (tmprc) { - cifs_dbg(FYI, "GetSrvInodeNum rc %d\n", - tmprc); - fattr.cf_uniqueid = iunique(sb, ROOT_I); - cifs_autodisable_serverino(cifs_sb); - } else if ((fattr.cf_uniqueid == 0) && - strlen(full_path) == 0) { - /* some servers ret bad root ino ie 0 */ - cifs_dbg(FYI, "Invalid (0) inodenum\n"); - fattr.cf_flags |= - CIFS_FATTR_FAKE_ROOT_INO; - fattr.cf_uniqueid = - simple_hashstr(tcon->treeName); - } - } else - fattr.cf_uniqueid = iunique(sb, ROOT_I); - } else { - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - && server->ops->get_srv_inum) { - /* - * Pass a NULL tcon to ensure we don't make a round - * trip to the server. This only works for SMB2+. - */ - tmprc = server->ops->get_srv_inum(xid, - NULL, cifs_sb, full_path, - &fattr.cf_uniqueid, data); - if (tmprc) - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; - else if ((fattr.cf_uniqueid == 0) && - strlen(full_path) == 0) { - /* - * Reuse existing root inode num since - * inum zero for root causes ls of . and .. to - * not be returned - */ - cifs_dbg(FYI, "Srv ret 0 inode num for root\n"); - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; - } - } else - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; - } +handle_mnt_opt: /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { @@ -900,8 +962,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, full_path, fid); if (rc) { cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n", - __func__, rc); - goto cgii_exit; + __func__, rc); + goto out; } } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false, @@ -909,7 +971,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (rc) { cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n", __func__, rc); - goto cgii_exit; + goto out; } } @@ -925,6 +987,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } + /* + * 5. 
Update inode with final fattr data + */ + if (!*inode) { *inode = cifs_iget(sb, &fattr); if (!*inode) @@ -937,7 +1003,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; - goto cgii_exit; + goto out; } /* if filetype is different, return error */ @@ -945,18 +1011,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, (fattr.cf_mode & S_IFMT))) { CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; - goto cgii_exit; + goto out; } cifs_fattr_to_inode(*inode, &fattr); } - -cgii_exit: - if ((*inode) && ((*inode)->i_ino == 0)) - cifs_dbg(FYI, "inode number of zero returned\n"); - - kfree(buf); +out: + cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); + kfree(tmp_data); return rc; } @@ -2475,9 +2538,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid); cifsFileInfo_put(wfile); if (rc) - return rc; + goto cifs_setattr_exit; } else if (rc != -EBADF) - return rc; + goto cifs_setattr_exit; else rc = 0; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 5ad83bdb9bea..40ca394fd5de 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -488,21 +488,10 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &pCifsInode->flags); - /* - * Set flag if the server downgrades the oplock - * to L2 else clear. - */ - if (pSMB->OplockLevel) - set_bit( - CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &pCifsInode->flags); - else - clear_bit( - CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, - &pCifsInode->flags); - - cifs_queue_oplock_break(netfile); + netfile->oplock_epoch = 0; + netfile->oplock_level = pSMB->OplockLevel; netfile->oplock_break_cancelled = false; + cifs_queue_oplock_break(netfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 85bd644f9773..fb3bdc44775c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -31,6 +31,231 @@ #include <linux/utsname.h> #include <linux/slab.h> #include "cifs_spnego.h" +#include "smb2proto.h" + +bool +is_server_using_iface(struct TCP_Server_Info *server, + struct cifs_server_iface *iface) +{ + struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr; + struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr; + struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr; + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr; + + if (server->dstaddr.ss_family != iface->sockaddr.ss_family) + return false; + if (server->dstaddr.ss_family == AF_INET) { + if (s4->sin_addr.s_addr != i4->sin_addr.s_addr) + return false; + } else if (server->dstaddr.ss_family == AF_INET6) { + if (memcmp(&s6->sin6_addr, &i6->sin6_addr, + sizeof(i6->sin6_addr)) != 0) + return false; + } else { + /* unknown family.. 
*/ + return false; + } + return true; +} + +bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) +{ + int i; + + for (i = 0; i < ses->chan_count; i++) { + if (is_server_using_iface(ses->chans[i].server, iface)) + return true; + } + return false; +} + +/* returns number of channels added */ +int cifs_try_adding_channels(struct cifs_ses *ses) +{ + int old_chan_count = ses->chan_count; + int left = ses->chan_max - ses->chan_count; + int i = 0; + int rc = 0; + int tries = 0; + + if (left <= 0) { + cifs_dbg(FYI, + "ses already at max_channels (%zu), nothing to open\n", + ses->chan_max); + return 0; + } + + if (ses->server->dialect < SMB30_PROT_ID) { + cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); + return 0; + } + + /* + * Keep connecting to the same, fastest iface for all channels as + * long as it is RSS-capable. Try the next fastest one if not RSS + * or if channel creation fails. + */ + while (left > 0) { + struct cifs_server_iface *iface; + + tries++; + if (tries > 3*ses->chan_max) { + cifs_dbg(FYI, "too many attempts at opening channels (%d channels left to open)\n", + left); + break; + } + + iface = &ses->iface_list[i]; + if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { + i = (i+1) % ses->iface_count; + continue; + } + + rc = cifs_ses_add_channel(ses, iface); + if (rc) { + cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", + i, rc); + i = (i+1) % ses->iface_count; + continue; + } + + cifs_dbg(FYI, "successfully opened new channel on iface#%d\n", + i); + left--; + } + + return ses->chan_count - old_chan_count; +} + +int +cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) +{ + struct cifs_chan *chan; + struct smb_vol vol = {NULL}; + static const char unc_fmt[] = "\\%s\\foo"; + char unc[sizeof(unc_fmt)+SERVER_NAME_LEN_WITH_NULL] = {0}; + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; + int rc; + unsigned int xid = get_xid(); + + cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ", + ses, iface->speed, iface->rdma_capable ? "yes" : "no"); + if (iface->sockaddr.ss_family == AF_INET) + cifs_dbg(FYI, "ip:%pI4)\n", &ipv4->sin_addr); + else + cifs_dbg(FYI, "ip:%pI6)\n", &ipv6->sin6_addr); + + /* + * Set up an smb_vol with mostly the same info as the existing + * session and overwrite it with the requested iface data. + * + * We need to set up at least the fields used for negprot and + * session setup. + * + * We only need the volume here, so we can reuse memory from + * the session and server without caring about memory + * management. + */ + + /* Always make new connection for now (TODO?) */ + vol.nosharesock = true; + + /* Auth */ + vol.domainauto = ses->domainAuto; + vol.domainname = ses->domainName; + vol.username = ses->user_name; + vol.password = ses->password; + vol.sectype = ses->sectype; + vol.sign = ses->sign; + + /* UNC and paths */ + /* XXX: Use ses->server->hostname? */ + sprintf(unc, unc_fmt, ses->serverName); + vol.UNC = unc; + vol.prepath = ""; + + /* Re-use same version as master connection */ + vol.vals = ses->server->vals; + vol.ops = ses->server->ops; + + vol.noblocksnd = ses->server->noblocksnd; + vol.noautotune = ses->server->noautotune; + vol.sockopt_tcp_nodelay = ses->server->tcp_nodelay; + vol.echo_interval = ses->server->echo_interval / HZ; + + /* + * This will be used for encoding/decoding user/domain/pw + * during session setup auth. 
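A note on the signkey field being populated around this code: every channel signs with its own key. The master connection copies ses->smb3signingkey into chans[0].signkey (see cifs_get_smb_ses earlier in this series), and each extra channel derives a fresh key during its binding session setup. At send time the key therefore has to be resolved by outgoing connection; schematically (illustrative only, the real lookup sits in smb2transport.c, outside this excerpt):

static u8 *select_signkey_sketch(struct cifs_ses *ses,
				 struct TCP_Server_Info *server)
{
	int i;

	/* a channel still being bound signs with the session key */
	if (ses->binding)
		return ses->smb3signingkey;

	for (i = 0; i < ses->chan_count; i++)
		if (ses->chans[i].server == server)
			return ses->chans[i].signkey;

	return ses->smb3signingkey;	/* fall back to the master key */
}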
+ * + * XXX: We use the default for simplicity but the proper way + * would be to use the one that ses used, which is not + * stored. This might break when dealing with non-ascii + * strings. + */ + vol.local_nls = load_nls_default(); + + /* Use RDMA if possible */ + vol.rdma = iface->rdma_capable; + memcpy(&vol.dstaddr, &iface->sockaddr, sizeof(struct sockaddr_storage)); + + /* reuse master con client guid */ + memcpy(&vol.client_guid, ses->server->client_guid, + SMB2_CLIENT_GUID_SIZE); + vol.use_client_guid = true; + + mutex_lock(&ses->session_mutex); + + chan = &ses->chans[ses->chan_count]; + chan->server = cifs_get_tcp_session(&vol); + if (IS_ERR(chan->server)) { + rc = PTR_ERR(chan->server); + chan->server = NULL; + goto out; + } + + /* + * We need to allocate the server crypto now as we will need + * to sign packets before we generate the channel signing key + * (we sign with the session key) + */ + rc = smb311_crypto_shash_allocate(chan->server); + if (rc) { + cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); + goto out; + } + + ses->binding = true; + rc = cifs_negotiate_protocol(xid, ses); + if (rc) + goto out; + + rc = cifs_setup_session(xid, ses, vol.local_nls); + if (rc) + goto out; + + /* success, put it on the list + * XXX: sharing ses between 2 tcp server is not possible, the + * way "internal" linked lists works in linux makes element + * only able to belong to one list + * + * the binding session is already established so the rest of + * the code should be able to look it up, no need to add the + * ses to the new server. + */ + + ses->chan_count++; + atomic_set(&ses->chan_seq, 0); +out: + ses->binding = false; + mutex_unlock(&ses->session_mutex); + + if (rc && chan->server) + cifs_put_tcp_session(chan->server, 0); + unload_nls(vol.local_nls); + + return rc; +} static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) { @@ -342,6 +567,7 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses) { + struct TCP_Server_Info *server = cifs_ses_server(ses); NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer; __u32 flags; @@ -354,9 +580,9 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | NTLMSSP_NEGOTIATE_SEAL; - if (ses->server->sign) + if (server->sign) flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess) + if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; sec_blob->NegotiateFlags = cpu_to_le32(flags); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index b7421a096319..d70a2bb062df 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -171,6 +171,9 @@ cifs_get_next_mid(struct TCP_Server_Info *server) /* we do not want to loop forever */ last_mid = cur_mid; cur_mid++; + /* avoid 0xFFFF MID */ + if (cur_mid == 0xffff) + cur_mid++; /* * This nested loop looks more expensive than it is. 
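The 0xFFFF skip added to cifs_get_next_mid() above exists because MS-CIFS reserves MID 0xFFFF: SMB1 servers use it for server-initiated messages such as oplock break notifications, so a client request carrying that MID can collide with them. The increment therefore behaves like this (sketch):

/* 16-bit MID increment that never yields the reserved 0xFFFF:
 * ..., 0xFFFD, 0xFFFE, (0xFFFF skipped, wraps to) 0x0000, ... */
static __u16 next_mid_sketch(__u16 cur)
{
	cur++;
	if (cur == 0xffff)	/* reserved for the server */
		cur++;		/* overflows to 0 */
	return cur;
}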
@@ -366,12 +369,10 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) static void cifs_downgrade_oplock(struct TCP_Server_Info *server, - struct cifsInodeInfo *cinode, bool set_level2) + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) { - if (set_level2) - cifs_set_oplock_level(cinode, OPLOCK_READ); - else - cifs_set_oplock_level(cinode, 0); + cifs_set_oplock_level(cinode, oplock); } static bool diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index e6a1fc72018f..8b0b512c5792 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -145,7 +145,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, cur = buf; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { if (flock->fl_start > li->offset || (flock->fl_start + length) < diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index e311f58dc1c8..0516fc482d43 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -29,6 +29,7 @@ #include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" +#include "nterr.h" static int check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid) @@ -249,16 +250,10 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) * of junk. Other servers match RFC1001 len to actual * SMB2/SMB3 frame length (header + smb2 response specific data) * Some windows servers also pad up to 8 bytes when compounding. - * If pad is longer than eight bytes, log the server behavior - * (once), since may indicate a problem but allow it and continue - * since the frame is parseable. */ - if (clc_len < len) { - pr_warn_once( - "srv rsp padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", - len, clc_len, command, mid); + if (clc_len < len) return 0; - } + pr_warn_once( "srv rsp too short, len %d not %d. cmd:%d mid:%llu\n", len, clc_len, command, mid); @@ -534,7 +529,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cifs_dbg(FYI, "found in the open list\n"); cifs_dbg(FYI, "lease key match, lease break 0x%x\n", - le32_to_cpu(rsp->NewLeaseState)); + lease_state); if (ack_req) cfile->oplock_break_cancelled = false; @@ -543,17 +538,8 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); - /* - * Set or clear flags depending on the lease state being READ. - * HANDLE caching flag should be added when the client starts - * to defer closing remote file handles with HANDLE leases. 
-		 */
-		if (lease_state & SMB2_LEASE_READ_CACHING_HE)
-			set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
-				&cinode->flags);
-		else
-			clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
-				  &cinode->flags);
+		cfile->oplock_epoch = le16_to_cpu(rsp->Epoch);
+		cfile->oplock_level = lease_state;
 
 		cifs_queue_oplock_break(cfile);
 		kfree(lw);
@@ -576,7 +562,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
 
 		cifs_dbg(FYI, "found in the pending open list\n");
 		cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
-			 le32_to_cpu(rsp->NewLeaseState));
+			 lease_state);
 
 		open->oplock = lease_state;
 	}
@@ -673,10 +659,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &server->smb_ses_list) {
 		ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+
 		list_for_each(tmp1, &ses->tcon_list) {
 			tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
 
-			cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
 			spin_lock(&tcon->open_file_lock);
 			list_for_each(tmp2, &tcon->openFileList) {
 				cfile = list_entry(tmp2, struct cifsFileInfo,
@@ -688,6 +674,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 					continue;
 
 				cifs_dbg(FYI, "file id match, oplock break\n");
+				cifs_stats_inc(
+				    &tcon->stats.cifs_stats.num_oplock_brks);
 				cinode = CIFS_I(d_inode(cfile->dentry));
 				spin_lock(&cfile->file_info_lock);
 				if (!CIFS_CACHE_WRITE(cinode) &&
@@ -699,18 +687,9 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 				set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
 					&cinode->flags);
 
-				/*
-				 * Set flag if the server downgrades the oplock
-				 * to L2 else clear.
-				 */
-				if (rsp->OplockLevel)
-					set_bit(
-					    CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
-					    &cinode->flags);
-				else
-					clear_bit(
-					    CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
-					    &cinode->flags);
+				cfile->oplock_epoch = 0;
+				cfile->oplock_level = rsp->OplockLevel;
+
 				spin_unlock(&cfile->file_info_lock);
 
 				cifs_queue_oplock_break(cfile);
@@ -720,9 +699,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 				return true;
 			}
 			spin_unlock(&tcon->open_file_lock);
-			spin_unlock(&cifs_tcp_ses_lock);
-			cifs_dbg(FYI, "No matching file for oplock break\n");
-			return true;
 		}
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
@@ -735,45 +711,98 @@ smb2_cancelled_close_fid(struct work_struct *work)
 {
 	struct close_cancelled_open *cancelled = container_of(work,
 					struct close_cancelled_open, work);
+	struct cifs_tcon *tcon = cancelled->tcon;
+	int rc;
 
-	cifs_dbg(VFS, "Close unmatched open\n");
+	if (cancelled->mid)
+		cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llx\n",
+			      cancelled->mid);
+	else
+		cifs_tcon_dbg(VFS, "Close interrupted close\n");
 
-	SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid,
-		   cancelled->fid.volatile_fid);
-	cifs_put_tcon(cancelled->tcon);
+	rc = SMB2_close(0, tcon, cancelled->fid.persistent_fid,
+			cancelled->fid.volatile_fid);
+	if (rc)
+		cifs_tcon_dbg(VFS, "Close cancelled mid failed rc:%d\n", rc);
+
+	cifs_put_tcon(tcon);
 	kfree(cancelled);
 }
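
/*
 * smb2_cancelled_close_fid() above is the worker half of the classic
 * deferred-cleanup pattern this patch leans on: an interrupted caller
 * cannot block, so it packages the state the cleanup needs and hands it
 * to a workqueue. A minimal kernel-style sketch of the pattern; the
 * names deferred_cleanup, cleanup_worker and defer_cleanup are
 * illustrative, not cifs API.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/workqueue.h>

struct deferred_cleanup {
	struct work_struct work;	/* embedded, for container_of() */
	u64 handle;			/* whatever state the worker needs */
};

static void cleanup_worker(struct work_struct *work)
{
	struct deferred_cleanup *dc =
		container_of(work, struct deferred_cleanup, work);

	/* issue the blocking cleanup call here (SMB2_close in the patch) */
	kfree(dc);			/* the worker owns the context */
}

static int defer_cleanup(struct workqueue_struct *wq, u64 handle)
{
	struct deferred_cleanup *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

	if (!dc)
		return -ENOMEM;
	dc->handle = handle;
	INIT_WORK(&dc->work, cleanup_worker);
	queue_work(wq, &dc->work);
	return 0;
}
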
+
+/*
+ * Caller should already have an extra reference to @tcon.
+ * This function is used to queue work to close a handle to prevent leaks
+ * on the server.
+ * We handle two cases: an open that was interrupted after we sent the
+ * SMB2_CREATE to the server but before we processed the reply, and a
+ * close that was interrupted before we sent the SMB2_CLOSE to the server.
+ */
+static int
+__smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid,
+			    __u64 persistent_fid, __u64 volatile_fid)
+{
+	struct close_cancelled_open *cancelled;
+
+	cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
+	if (!cancelled)
+		return -ENOMEM;
+
+	cancelled->fid.persistent_fid = persistent_fid;
+	cancelled->fid.volatile_fid = volatile_fid;
+	cancelled->tcon = tcon;
+	cancelled->cmd = cmd;
+	cancelled->mid = mid;
+	INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
+	WARN_ON(queue_work(cifsiod_wq, &cancelled->work) == false);
+
+	return 0;
+}
+
+int
+smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
+			    __u64 volatile_fid)
+{
+	int rc;
+
+	cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
+	spin_lock(&cifs_tcp_ses_lock);
+	tcon->tc_count++;
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	rc = __smb2_handle_cancelled_cmd(tcon, SMB2_CLOSE_HE, 0,
+					 persistent_fid, volatile_fid);
+	if (rc)
+		cifs_put_tcon(tcon);
+
+	return rc;
+}
+
 int
 smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
 {
 	struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
 	struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
 	struct cifs_tcon *tcon;
-	struct close_cancelled_open *cancelled;
+	int rc;
 
 	if (sync_hdr->Command != SMB2_CREATE ||
 	    sync_hdr->Status != STATUS_SUCCESS)
 		return 0;
 
-	cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
-	if (!cancelled)
-		return -ENOMEM;
-
 	tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
 				  sync_hdr->TreeId);
-	if (!tcon) {
-		kfree(cancelled);
+	if (!tcon)
 		return -ENOENT;
-	}
 
-	cancelled->fid.persistent_fid = rsp->PersistentFileId;
-	cancelled->fid.volatile_fid = rsp->VolatileFileId;
-	cancelled->tcon = tcon;
-	INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
-	queue_work(cifsiod_wq, &cancelled->work);
+	rc = __smb2_handle_cancelled_cmd(tcon,
+					 le16_to_cpu(sync_hdr->Command),
+					 le64_to_cpu(sync_hdr->MessageId),
+					 rsp->PersistentFileId,
+					 rsp->VolatileFileId);
+	if (rc)
+		cifs_put_tcon(tcon);
 
-	return 0;
+	return rc;
 }
 
 /**
@@ -788,23 +817,37 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
 	int i, rc;
 	struct sdesc *d;
 	struct smb2_sync_hdr *hdr;
+	struct TCP_Server_Info *server = cifs_ses_server(ses);
 
-	if (ses->server->tcpStatus == CifsGood) {
-		/* skip non smb311 connections */
-		if (ses->server->dialect != SMB311_PROT_ID)
-			return 0;
+	hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+	/* negprot messages are always taken into the hash */
+	if (hdr->Command == SMB2_NEGOTIATE)
+		goto ok;
 
-		/* skip last sess setup response */
-		hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
-		if (hdr->Flags & SMB2_FLAGS_SIGNED)
-			return 0;
-	}
+	/*
+	 * If we process a command which wasn't a negprot it means the
	 * negprot was already done, so the server dialect was set
+	 * and we can test it. Preauth requires 3.1.1 for now.
+ */ + if (server->dialect != SMB311_PROT_ID) + return 0; + + if (hdr->Command != SMB2_SESSION_SETUP) + return 0; + + /* skip last sess setup response */ + if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) + && (hdr->Status == NT_STATUS_OK + || (hdr->Status != + cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)))) + return 0; - rc = smb311_crypto_shash_allocate(ses->server); +ok: + rc = smb311_crypto_shash_allocate(server); if (rc) return rc; - d = ses->server->secmech.sdescsha512; + d = server->secmech.sdescsha512; rc = crypto_shash_init(&d->shash); if (rc) { cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 4c0922596467..a7f328f79c6f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -10,6 +10,7 @@ #include <linux/falloc.h> #include <linux/scatterlist.h> #include <linux/uuid.h> +#include <linux/sort.h> #include <crypto/aead.h> #include "cifsglob.h" #include "smb2pdu.h" @@ -151,13 +152,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype) static unsigned int smb2_get_credits(struct mid_q_entry *mid) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf; - - if (mid->mid_state == MID_RESPONSE_RECEIVED - || mid->mid_state == MID_RESPONSE_MALFORMED) - return le16_to_cpu(shdr->CreditRequest); - - return 0; + return mid->credits_received; } static int @@ -315,7 +310,7 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) { int rc; - ses->server->CurrentMid = 0; + cifs_ses_server(ses)->CurrentMid = 0; rc = SMB2_negotiate(xid, ses); /* BB we probably don't need to retry with modern servers */ if (rc == -EAGAIN) @@ -558,6 +553,13 @@ out: return rc; } +static int compare_iface(const void *ia, const void *ib) +{ + const struct cifs_server_iface *a = (struct cifs_server_iface *)ia; + const struct cifs_server_iface *b = (struct cifs_server_iface *)ib; + + return a->speed == b->speed ? 0 : (a->speed > b->speed ? 
-1 : 1); +} static int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) @@ -587,6 +589,9 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) if (rc) goto out; + /* sort interfaces from fastest to slowest */ + sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL); + spin_lock(&ses->iface_lock); kfree(ses->iface_list); ses->iface_list = iface_list; @@ -1402,15 +1407,10 @@ smb2_ioctl_query_info(const unsigned int xid, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - - if (copy_from_user(buffer, arg + sizeof(struct smb_query_info), - qi.output_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; - } + buffer = memdup_user(arg + sizeof(struct smb_query_info), + qi.output_buffer_length); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); /* Open */ memset(&open_iov, 0, sizeof(open_iov)); @@ -1529,35 +1529,32 @@ smb2_ioctl_query_info(const unsigned int xid, if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length) qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); if (qi.input_buffer_length > 0 && - le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length > rsp_iov[1].iov_len) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } + le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length + > rsp_iov[1].iov_len) + goto e_fault; + + if (copy_to_user(&pqi->input_buffer_length, + &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) + goto e_fault; + if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info), (const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset), - qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; - } + qi.input_buffer_length)) + goto e_fault; } else { pqi = (struct smb_query_info __user *)arg; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length) qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; - } + if (copy_to_user(&pqi->input_buffer_length, + &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) + goto e_fault; + + if (copy_to_user(pqi + 1, qi_rsp->Buffer, + qi.input_buffer_length)) + goto e_fault; } iqinf_exit: @@ -1573,6 +1570,10 @@ smb2_ioctl_query_info(const unsigned int xid, free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); return rc; + +e_fault: + rc = -EFAULT; + goto iqinf_exit; } static ssize_t @@ -3281,22 +3282,38 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, static void smb2_downgrade_oplock(struct TCP_Server_Info *server, - struct cifsInodeInfo *cinode, bool set_level2) + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) { - if (set_level2) - server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II, - 0, NULL); - else - server->ops->set_oplock_level(cinode, 0, 0, NULL); + server->ops->set_oplock_level(cinode, oplock, 0, NULL); } static void -smb21_downgrade_oplock(struct TCP_Server_Info *server, - struct cifsInodeInfo *cinode, bool set_level2) 
+smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache); + +static void +smb3_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) { - server->ops->set_oplock_level(cinode, - set_level2 ? SMB2_LEASE_READ_CACHING_HE : - 0, 0, NULL); + unsigned int old_state = cinode->oplock; + unsigned int old_epoch = cinode->epoch; + unsigned int new_state; + + if (epoch > old_epoch) { + smb21_set_oplock_level(cinode, oplock, 0, NULL); + cinode->epoch = epoch; + } + + new_state = cinode->oplock; + *purge_cache = false; + + if ((old_state & CIFS_CACHE_READ_FLG) != 0 && + (new_state & CIFS_CACHE_READ_FLG) == 0) + *purge_cache = true; + else if (old_state == new_state && (epoch - old_epoch > 1)) + *purge_cache = true; } static void @@ -3598,14 +3615,16 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) u8 *ses_enc_key; spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid != ses_id) - continue; - ses_enc_key = enc ? ses->smb3encryptionkey : - ses->smb3decryptionkey; - memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE); - spin_unlock(&cifs_tcp_ses_lock); - return 0; + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) { + ses_enc_key = enc ? ses->smb3encryptionkey : + ses->smb3decryptionkey; + memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE); + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + } } spin_unlock(&cifs_tcp_ses_lock); @@ -4084,6 +4103,7 @@ free_pages: kfree(dw->ppages); cifs_small_buf_release(dw->buf); + kfree(dw); } @@ -4157,7 +4177,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, dw->server = server; dw->ppages = pages; dw->len = len; - queue_work(cifsiod_wq, &dw->decrypt); + queue_work(decrypt_wq, &dw->decrypt); *num_mids = 0; /* worker thread takes care of finding mid */ return -1; } @@ -4555,7 +4575,7 @@ struct smb_version_operations smb21_operations = { .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb21_downgrade_oplock, + .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -4655,7 +4675,7 @@ struct smb_version_operations smb30_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb21_downgrade_oplock, + .downgrade_oplock = smb3_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, @@ -4763,7 +4783,7 @@ struct smb_version_operations smb311_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb21_downgrade_oplock, + .downgrade_oplock = smb3_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb3_negotiate_wsize, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 05149862aea4..ed77f94dbf1d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -252,7 +252,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (tcon == NULL) return 0; - if (smb2_command == SMB2_TREE_CONNECT || 
smb2_command == SMB2_IOCTL) + if (smb2_command == SMB2_TREE_CONNECT) return 0; if (tcon->tidStatus == CifsExiting) { @@ -426,16 +426,9 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, * SMB information in the SMB header. If the return code is zero, this * function must have filled in request_buf pointer. */ -static int -smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, - void **request_buf, unsigned int *total_len) +static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) { - int rc; - - rc = smb2_reconnect(smb2_command, tcon); - if (rc) - return rc; - /* BB eventually switch this to SMB2 specific small buf size */ if (smb2_command == SMB2_SET_INFO) *request_buf = cifs_buf_get(); @@ -456,7 +449,31 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, cifs_stats_inc(&tcon->num_smbs_sent); } - return rc; + return 0; +} + +static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) +{ + int rc; + + rc = smb2_reconnect(smb2_command, tcon); + if (rc) + return rc; + + return __smb2_plain_req_init(smb2_command, tcon, request_buf, + total_len); +} + +static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) +{ + /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */ + if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) { + return __smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, + total_len); + } + return smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, total_len); } /* For explanation of negotiate contexts see MS-SMB2 section 2.2.3.1 */ @@ -791,7 +808,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) struct kvec rsp_iov; int rc = 0; int resp_buftype; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); int blob_offset, blob_length; char *security_blob; int flags = CIFS_NEG_OP; @@ -813,7 +830,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); - if (strcmp(ses->server->vals->version_string, + if (strcmp(server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); @@ -829,7 +846,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) total_len += 8; } else { /* otherwise send specific dialect */ - req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); + req->Dialects[0] = cpu_to_le16(server->vals->protocol_id); req->DialectCount = cpu_to_le16(1); total_len += 2; } @@ -1171,7 +1188,7 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) int rc; struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); unsigned int total_len; rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req, @@ -1179,13 +1196,21 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) if (rc) return rc; - /* First session, not a reauthenticate */ - req->sync_hdr.SessionId = 0; - - /* if reconnect, we need to send previous sess id, otherwise it is 0 */ - req->PreviousSessionId = sess_data->previous_session; - - req->Flags = 0; /* MBZ */ + if (sess_data->ses->binding) { + req->sync_hdr.SessionId = sess_data->ses->Suid; + 
req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->PreviousSessionId = 0; + req->Flags = SMB2_SESSION_REQ_FLAG_BINDING; + } else { + /* First session, not a reauthenticate */ + req->sync_hdr.SessionId = 0; + /* + * if reconnect, we need to send previous sess id + * otherwise it is 0 + */ + req->PreviousSessionId = sess_data->previous_session; + req->Flags = 0; /* MBZ */ + } /* enough to enable echos and oplocks and one max size write */ req->sync_hdr.CreditRequest = cpu_to_le16(130); @@ -1258,28 +1283,33 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) { int rc = 0; struct cifs_ses *ses = sess_data->ses; + struct TCP_Server_Info *server = cifs_ses_server(ses); - mutex_lock(&ses->server->srv_mutex); - if (ses->server->ops->generate_signingkey) { - rc = ses->server->ops->generate_signingkey(ses); + mutex_lock(&server->srv_mutex); + if (server->ops->generate_signingkey) { + rc = server->ops->generate_signingkey(ses); if (rc) { cifs_dbg(FYI, "SMB3 session key generation failed\n"); - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); return rc; } } - if (!ses->server->session_estab) { - ses->server->sequence_number = 0x2; - ses->server->session_estab = true; + if (!server->session_estab) { + server->sequence_number = 0x2; + server->session_estab = true; } - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + /* keep existing ses state if binding */ + if (!ses->binding) { + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + return rc; } @@ -1317,16 +1347,19 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; } - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto out_put_spnego_key; + /* keep session key if binding */ + if (!ses->binding) { + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; } - ses->auth_key.len = msg->sesskey_len; sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; sess_data->iov[1].iov_len = msg->secblob_len; @@ -1336,9 +1369,11 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->sync_hdr.SessionId; - - ses->session_flags = le16_to_cpu(rsp->SessionFlags); + /* keep session id and flags if binding */ + if (!ses->binding) { + ses->Suid = rsp->sync_hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + } rc = SMB2_sess_establish_session(sess_data); out_put_spnego_key: @@ -1432,9 +1467,11 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); - - ses->Suid = rsp->sync_hdr.SessionId; - ses->session_flags = le16_to_cpu(rsp->SessionFlags); + /* keep existing ses id and flags if binding */ + if (!ses->binding) { + ses->Suid = rsp->sync_hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + } out: kfree(ntlmssp_blob); @@ -1491,8 
+1528,11 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - ses->Suid = rsp->sync_hdr.SessionId; - ses->session_flags = le16_to_cpu(rsp->SessionFlags); + /* keep existing ses id and flags if binding */ + if (!ses->binding) { + ses->Suid = rsp->sync_hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + } rc = SMB2_sess_establish_session(sess_data); out: @@ -1509,7 +1549,7 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) { int type; - type = smb2_select_sectype(ses->server, ses->sectype); + type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { cifs_dbg(VFS, @@ -1537,7 +1577,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; - struct TCP_Server_Info *server = ses->server; + struct TCP_Server_Info *server = cifs_ses_server(ses); struct SMB2_sess_data *sess_data; cifs_dbg(FYI, "Session Setup\n"); @@ -1563,7 +1603,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, /* * Initialize the session hash with the server one. */ - memcpy(ses->preauth_sha_hash, ses->server->preauth_sha_hash, + memcpy(ses->preauth_sha_hash, server->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); while (sess_data->func) @@ -1807,6 +1847,8 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; + close_shroot(&tcon->crfid); + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, &total_len); if (rc) @@ -2661,7 +2703,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, int rc; char *in_data_buf; - rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); + rc = smb2_ioctl_req_init(opcode, tcon, (void **) &req, &total_len); if (rc) return rc; @@ -2972,7 +3014,21 @@ int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { - return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0); + int rc; + int tmp_rc; + + rc = SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0); + + /* retry close in a worker thread if this one is interrupted */ + if (rc == -EINTR) { + tmp_rc = smb2_handle_cancelled_close(tcon, persistent_fid, + volatile_fid); + if (tmp_rc) + cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n", + persistent_fid, tmp_rc); + } + + return rc; } int diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ea735d59c36e..f264e1d36fe1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -838,6 +838,7 @@ struct create_durable_handle_reconnect_v2 { struct create_context ccontext; __u8 Name[8]; struct durable_reconnect_context_v2 dcontext; + __u8 Pad[4]; } __packed; /* See MS-SMB2 2.2.13.2.5 */ @@ -1385,7 +1386,7 @@ struct smb2_oplock_break { struct smb2_lease_break { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 44 */ - __le16 Reserved; + __le16 Epoch; __le32 Flags; __u8 LeaseKey[16]; __le32 CurrentLeaseState; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 71b2930b8e0b..d21a5fcc8d06 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -46,7 +46,8 @@ extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *); extern int smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, bool log_error); extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, - struct 
smb_rqst *rqst); + struct TCP_Server_Info *, + struct smb_rqst *rqst); extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, @@ -212,6 +213,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, const __u8 oplock_level); +extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon, + __u64 persistent_fid, + __u64 volatile_fid); extern int smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server); void smb2_cancelled_close_fid(struct work_struct *work); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 148d7942c796..387c88704c52 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -98,6 +98,61 @@ err: return rc; } + +static +int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) +{ + struct cifs_chan *chan; + struct cifs_ses *ses = NULL; + int i; + int rc = 0; + + spin_lock(&cifs_tcp_ses_lock); + + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid == ses_id) + goto found; + } + } + cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n", + __func__, ses_id); + rc = -ENOENT; + goto out; + +found: + if (ses->binding) { + /* + * If we are in the process of binding a new channel + * to an existing session, use the master connection + * session key + */ + memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + goto out; + } + + /* + * Otherwise, use the channel key. + */ + + for (i = 0; i < ses->chan_count; i++) { + chan = ses->chans + i; + if (chan->server == server) { + memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); + goto out; + } + } + + cifs_dbg(VFS, + "%s: Could not find channel signing key for session 0x%llx\n", + __func__, ses_id); + rc = -ENOENT; + +out: + spin_unlock(&cifs_tcp_ses_lock); + return rc; +} + static struct cifs_ses * smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) { @@ -328,21 +383,45 @@ generate_smb3signingkey(struct cifs_ses *ses, { int rc; - rc = generate_key(ses, ptriplet->signing.label, - ptriplet->signing.context, ses->smb3signingkey, - SMB3_SIGN_KEY_SIZE); - if (rc) - return rc; + /* + * All channels use the same encryption/decryption keys but + * they have their own signing key. 
+ * + * When we generate the keys, check if it is for a new channel + * (binding) in which case we only need to generate a signing + * key and store it in the channel as to not overwrite the + * master connection signing key stored in the session + */ - rc = generate_key(ses, ptriplet->encryption.label, - ptriplet->encryption.context, ses->smb3encryptionkey, - SMB3_SIGN_KEY_SIZE); - if (rc) - return rc; + if (ses->binding) { + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, + cifs_ses_binding_channel(ses)->signkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + } else { + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, + ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; - rc = generate_key(ses, ptriplet->decryption.label, - ptriplet->decryption.context, - ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + memcpy(ses->chans[0].signkey, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + + rc = generate_key(ses, ptriplet->encryption.label, + ptriplet->encryption.context, + ses->smb3encryptionkey, + SMB3_SIGN_KEY_SIZE); + rc = generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + } if (rc) return rc; @@ -431,21 +510,19 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; - struct cifs_ses *ses; struct shash_desc *shash = &server->secmech.sdesccmacaes->shash; struct smb_rqst drqst; + u8 key[SMB3_SIGN_KEY_SIZE]; - ses = smb2_find_smb_ses(server, shdr->SessionId); - if (!ses) { - cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); + rc = smb2_get_sign_key(shdr->SessionId, server, key); + if (rc) return 0; - } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = crypto_shash_setkey(server->secmech.cmacaes, - ses->smb3signingkey, SMB2_CMACAES_SIZE); + key, SMB2_CMACAES_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); return rc; @@ -494,16 +571,25 @@ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_sync_hdr *shdr; + struct smb2_sess_setup_req *ssr; + bool is_binding; + bool is_signed; - if (!(shdr->Flags & SMB2_FLAGS_SIGNED) || - server->tcpStatus == CifsNeedNegotiate) - return rc; + shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + ssr = (struct smb2_sess_setup_req *)shdr; + + is_binding = shdr->Command == SMB2_SESSION_SETUP && + (ssr->Flags & SMB2_SESSION_REQ_FLAG_BINDING); + is_signed = shdr->Flags & SMB2_FLAGS_SIGNED; - if (!server->session_estab) { + if (!is_signed) + return 0; + if (server->tcpStatus == CifsNeedNegotiate) + return 0; + if (!is_binding && !server->session_estab) { strncpy(shdr->Signature, "BSRSPYL", 8); - return rc; + return 0; } rc = server->ops->calc_signature(rqst, server); @@ -610,18 +696,18 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, } static int -smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, - struct mid_q_entry **mid) +smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, + struct smb2_sync_hdr *shdr, struct mid_q_entry **mid) { - if (ses->server->tcpStatus == CifsExiting) + if (server->tcpStatus == CifsExiting) return -ENOENT; - if 
(ses->server->tcpStatus == CifsNeedReconnect) { + if (server->tcpStatus == CifsNeedReconnect) { cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); return -EAGAIN; } - if (ses->server->tcpStatus == CifsNeedNegotiate && + if (server->tcpStatus == CifsNeedNegotiate && shdr->Command != SMB2_NEGOTIATE) return -EAGAIN; @@ -638,11 +724,11 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr, /* else ok - we are shutting down the session */ } - *mid = smb2_mid_entry_alloc(shdr, ses->server); + *mid = smb2_mid_entry_alloc(shdr, server); if (*mid == NULL) return -ENOMEM; spin_lock(&GlobalMid_Lock); - list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q); + list_add_tail(&(*mid)->qhead, &server->pending_mid_q); spin_unlock(&GlobalMid_Lock); return 0; @@ -675,24 +761,25 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, } struct mid_q_entry * -smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) +smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server, + struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; - smb2_seq_num_into_buf(ses->server, shdr); + smb2_seq_num_into_buf(server, shdr); - rc = smb2_get_mid_entry(ses, shdr, &mid); + rc = smb2_get_mid_entry(ses, server, shdr, &mid); if (rc) { - revert_current_mid_from_hdr(ses->server, shdr); + revert_current_mid_from_hdr(server, shdr); return ERR_PTR(rc); } - rc = smb2_sign_rqst(rqst, ses->server); + rc = smb2_sign_rqst(rqst, server); if (rc) { - revert_current_mid_from_hdr(ses->server, shdr); + revert_current_mid_from_hdr(server, shdr); cifs_delete_mid(mid); return ERR_PTR(rc); } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 3c91fa97c9a8..5b1b97e9e0c9 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1069,7 +1069,7 @@ static int smbd_post_send_data( if (n_vec > SMBDIRECT_MAX_SGE) { cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); - return -ENOMEM; + return -EINVAL; } sg_init_table(sgl, n_vec); @@ -1476,6 +1476,7 @@ void smbd_destroy(struct TCP_Server_Info *server) info->transport_status = SMBD_DESTROYED; destroy_workqueue(info->workqueue); + log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); } @@ -1505,8 +1506,9 @@ create_conn: log_rdma_event(INFO, "creating rdma session\n"); server->smbd_conn = smbd_get_connection( server, (struct sockaddr *) &server->dstaddr); - log_rdma_event(INFO, "created rdma session info=%p\n", - server->smbd_conn); + + if (server->smbd_conn) + cifs_dbg(VFS, "RDMA transport re-established\n"); return server->smbd_conn ? 
0 : -ENOENT;
 }
 
@@ -1970,7 +1972,7 @@ read_rfc1002_done:
 
 	if (info->transport_status != SMBD_CONNECTED) {
 		log_read(ERR, "disconnected\n");
-		return 0;
+		return -ECONNABORTED;
 	}
 
 	goto again;
@@ -2269,12 +2271,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
 	int rc;
 
 	list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
-		if (smbdirect_mr->state == MR_INVALIDATED)
-			ib_dma_unmap_sg(
-				info->id->device, smbdirect_mr->sgl,
-				smbdirect_mr->sgl_count,
-				smbdirect_mr->dir);
-		else if (smbdirect_mr->state == MR_ERROR) {
+		if (smbdirect_mr->state == MR_ERROR) {
 
 			/* recover this MR entry */
 			rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2602,11 +2599,20 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
 		 */
 		smbdirect_mr->state = MR_INVALIDATED;
 
-	/*
-	 * Schedule the work to do MR recovery for future I/Os
-	 * MR recovery is slow and we don't want it to block the current I/O
-	 */
-	queue_work(info->workqueue, &info->mr_recovery_work);
+	if (smbdirect_mr->state == MR_INVALIDATED) {
+		ib_dma_unmap_sg(
+			info->id->device, smbdirect_mr->sgl,
+			smbdirect_mr->sgl_count,
+			smbdirect_mr->dir);
+		smbdirect_mr->state = MR_READY;
+		if (atomic_inc_return(&info->mr_ready_count) == 1)
+			wake_up_interruptible(&info->wait_mr);
+	} else
+		/*
+		 * Schedule the work to do MR recovery for future I/Os;
+		 * MR recovery is slow and we don't want it to block the
+		 * current I/O
+		 */
+		queue_work(info->workqueue, &info->mr_recovery_work);
 
 done:
 	if (atomic_dec_and_test(&info->mr_used_count))
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 308ad0f495e1..3d2e11f85cba 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -86,29 +86,21 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 static void _cifs_mid_q_entry_release(struct kref *refcount)
 {
-	struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry,
-					       refcount);
-
-	mempool_free(mid, cifs_mid_poolp);
-}
-
-void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
-{
-	spin_lock(&GlobalMid_Lock);
-	kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
-	spin_unlock(&GlobalMid_Lock);
-}
-
-void
-DeleteMidQEntry(struct mid_q_entry *midEntry)
-{
+	struct mid_q_entry *midEntry =
+			container_of(refcount, struct mid_q_entry, refcount);
 #ifdef CONFIG_CIFS_STATS2
 	__le16 command = midEntry->server->vals->lock_cmd;
 	__u16 smb_cmd = le16_to_cpu(midEntry->command);
 	unsigned long now;
 	unsigned long roundtrip_time;
-	struct TCP_Server_Info *server = midEntry->server;
 #endif
+	struct TCP_Server_Info *server = midEntry->server;
+
+	if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
+	    midEntry->mid_state == MID_RESPONSE_RECEIVED &&
+	    server->ops->handle_cancelled_mid)
+		server->ops->handle_cancelled_mid(midEntry->resp_buf, server);
+
 	midEntry->mid_state = MID_FREE;
 	atomic_dec(&midCount);
 	if (midEntry->large_buf)
@@ -166,6 +158,19 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
 		}
 	}
 #endif
+
+	mempool_free(midEntry, cifs_mid_poolp);
+}
+
+void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
+{
+	spin_lock(&GlobalMid_Lock);
+	kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
+	spin_unlock(&GlobalMid_Lock);
+}
+
+void DeleteMidQEntry(struct mid_q_entry *midEntry)
+{
 	cifs_mid_q_entry_release(midEntry);
 }
 
@@ -173,8 +178,10 @@ void
 cifs_delete_mid(struct mid_q_entry *mid)
 {
 	spin_lock(&GlobalMid_Lock);
-	list_del_init(&mid->qhead);
-	mid->mid_flags |= MID_DELETED;
+	if (!(mid->mid_flags & MID_DELETED)) {
+		list_del_init(&mid->qhead);
+		mid->mid_flags |= MID_DELETED;
+	}
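
/*
 * This hunk and the matching one in cifs_sync_mid_result() below close a
 * double-unlink race by making the MID_DELETED flag the single source of
 * truth for "already off the queue". The pattern in isolation, with
 * illustrative names (tracked_entry, entry_unlink) rather than the cifs
 * mid_q_entry types:
 */
#include <linux/list.h>
#include <linux/spinlock.h>

#define ENTRY_DELETED 0x01

struct tracked_entry {
	struct list_head qhead;
	unsigned int flags;	/* protected by *lock */
};

/* Safe to call from multiple paths; only the first caller unlinks. */
static void entry_unlink(struct tracked_entry *e, spinlock_t *lock)
{
	spin_lock(lock);
	if (!(e->flags & ENTRY_DELETED)) {
		list_del_init(&e->qhead);
		e->flags |= ENTRY_DELETED;
	}
	spin_unlock(lock);
}
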
spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(mid); @@ -318,8 +325,11 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, int val = 1; __be32 rfc1002_marker; - if (cifs_rdma_enabled(server) && server->smbd_conn) { - rc = smbd_send(server, num_rqst, rqst); + if (cifs_rdma_enabled(server)) { + /* return -EAGAIN when connecting or reconnecting */ + rc = -EAGAIN; + if (server->smbd_conn) + rc = smbd_send(server, num_rqst, rqst); goto smbd_done; } @@ -872,7 +882,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) rc = -EHOSTDOWN; break; default: - list_del_init(&mid->qhead); + if (!(mid->mid_flags & MID_DELETED)) { + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", __func__, mid->mid, mid->mid_state); rc = -EIO; @@ -923,7 +936,8 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server, } struct mid_q_entry * -cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) +cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored, + struct smb_rqst *rqst) { int rc; struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base; @@ -995,7 +1009,18 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - server = ses->server; + if (!ses->binding) { + uint index = 0; + + if (ses->chan_count > 1) { + index = (uint)atomic_inc_return(&ses->chan_seq); + index %= ses->chan_count; + } + server = ses->chans[index].server; + } else { + server = cifs_ses_server(ses); + } + if (server->tcpStatus == CifsExiting) return -ENOENT; @@ -1040,7 +1065,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } for (i = 0; i < num_rqst; i++) { - midQ[i] = server->ops->setup_request(ses, &rqst[i]); + midQ[i] = server->ops->setup_request(ses, server, &rqst[i]); if (IS_ERR(midQ[i])) { revert_current_mid(server, i); for (j = 0; j < i; j++) @@ -1115,8 +1140,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); + midQ[i]->mid_flags |= MID_WAIT_CANCELLED; if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { - midQ[i]->mid_flags |= MID_WAIT_CANCELLED; midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; @@ -1283,7 +1308,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = allocate_mid(ses, in_buf, &midQ); if (rc) { - mutex_unlock(&ses->server->srv_mutex); + mutex_unlock(&server->srv_mutex); /* Update # of requests on wire to server */ add_credits(server, &credits, 0); return rc; diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index b7f9ffa1d5f1..aaad4ca1217e 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -48,8 +48,8 @@ #define elf_prstatus compat_elf_prstatus #define elf_prpsinfo compat_elf_prpsinfo -#undef ns_to_timeval -#define ns_to_timeval ns_to_old_timeval32 +#undef ns_to_kernel_old_timeval +#define ns_to_kernel_old_timeval ns_to_old_timeval32 /* * To use this file, asm/elf.h must define compat_elf_check_arch. diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 62e530814cef..358ea2ecf36b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -11,8 +11,6 @@ * ioctls. 
*/ -#include <linux/joystick.h> - #include <linux/types.h> #include <linux/compat.h> #include <linux/kernel.h> @@ -27,13 +25,9 @@ #include <linux/file.h> #include <linux/ppp-ioctl.h> #include <linux/if_pppox.h> -#include <linux/mtio.h> #include <linux/tty.h> #include <linux/vt_kern.h> -#include <linux/raw.h> #include <linux/blkdev.h> -#include <linux/rtc.h> -#include <linux/pci.h> #include <linux/serial.h> #include <linux/ctype.h> #include <linux/syscalls.h> @@ -42,13 +36,6 @@ #include "internal.h" -#include <net/bluetooth/bluetooth.h> -#include <net/bluetooth/hci_sock.h> -#include <net/bluetooth/rfcomm.h> - -#include <linux/capi.h> -#include <linux/gigaset_dev.h> - #ifdef CONFIG_BLOCK #include <linux/cdrom.h> #include <linux/fd.h> @@ -60,451 +47,11 @@ #include <linux/uaccess.h> #include <linux/watchdog.h> -#include <linux/soundcard.h> - #include <linux/hiddev.h> #include <linux/sort.h> -#ifdef CONFIG_SPARC -#include <linux/fb.h> -#include <asm/fbio.h> -#endif - -#define convert_in_user(srcptr, dstptr) \ -({ \ - typeof(*srcptr) val; \ - \ - get_user(val, srcptr) || put_user(val, dstptr); \ -}) - -static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - int err; - - err = security_file_ioctl(file, cmd, arg); - if (err) - return err; - - return vfs_ioctl(file, cmd, arg); -} - -#ifdef CONFIG_BLOCK -typedef struct sg_io_hdr32 { - compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ - compat_int_t dxfer_direction; /* [i] data transfer direction */ - unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ - unsigned char mx_sb_len; /* [i] max length to write to sbp */ - unsigned short iovec_count; /* [i] 0 implies no scatter gather */ - compat_uint_t dxfer_len; /* [i] byte count of data transfer */ - compat_uint_t dxferp; /* [i], [*io] points to data transfer memory - or scatter gather list */ - compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ - compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ - compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ - compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... 
*/ - compat_int_t pack_id; /* [i->o] unused internally (normally) */ - compat_uptr_t usr_ptr; /* [i->o] unused internally */ - unsigned char status; /* [o] scsi status */ - unsigned char masked_status; /* [o] shifted, masked scsi status */ - unsigned char msg_status; /* [o] messaging level data (optional) */ - unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ - unsigned short host_status; /* [o] errors from host adapter */ - unsigned short driver_status; /* [o] errors from software driver */ - compat_int_t resid; /* [o] dxfer_len - actual_transferred */ - compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ - compat_uint_t info; /* [o] auxiliary information */ -} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ - -typedef struct sg_iovec32 { - compat_uint_t iov_base; - compat_uint_t iov_len; -} sg_iovec32_t; - -static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) -{ - sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); - sg_iovec32_t __user *iov32 = dxferp; - int i; - - for (i = 0; i < iovec_count; i++) { - u32 base, len; - - if (get_user(base, &iov32[i].iov_base) || - get_user(len, &iov32[i].iov_len) || - put_user(compat_ptr(base), &iov[i].iov_base) || - put_user(len, &iov[i].iov_len)) - return -EFAULT; - } - - if (put_user(iov, &sgio->dxferp)) - return -EFAULT; - return 0; -} - -static int sg_ioctl_trans(struct file *file, unsigned int cmd, - sg_io_hdr32_t __user *sgio32) -{ - sg_io_hdr_t __user *sgio; - u16 iovec_count; - u32 data; - void __user *dxferp; - int err; - int interface_id; - - if (get_user(interface_id, &sgio32->interface_id)) - return -EFAULT; - if (interface_id != 'S') - return do_ioctl(file, cmd, (unsigned long)sgio32); - - if (get_user(iovec_count, &sgio32->iovec_count)) - return -EFAULT; - - { - void __user *top = compat_alloc_user_space(0); - void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + - (iovec_count * sizeof(sg_iovec_t))); - if (new > top) - return -EINVAL; - - sgio = new; - } - - /* Ok, now construct. 
*/ - if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, - (2 * sizeof(int)) + - (2 * sizeof(unsigned char)) + - (1 * sizeof(unsigned short)) + - (1 * sizeof(unsigned int)))) - return -EFAULT; - - if (get_user(data, &sgio32->dxferp)) - return -EFAULT; - dxferp = compat_ptr(data); - if (iovec_count) { - if (sg_build_iovec(sgio, dxferp, iovec_count)) - return -EFAULT; - } else { - if (put_user(dxferp, &sgio->dxferp)) - return -EFAULT; - } - - { - unsigned char __user *cmdp; - unsigned char __user *sbp; - - if (get_user(data, &sgio32->cmdp)) - return -EFAULT; - cmdp = compat_ptr(data); - - if (get_user(data, &sgio32->sbp)) - return -EFAULT; - sbp = compat_ptr(data); - - if (put_user(cmdp, &sgio->cmdp) || - put_user(sbp, &sgio->sbp)) - return -EFAULT; - } - - if (copy_in_user(&sgio->timeout, &sgio32->timeout, - 3 * sizeof(int))) - return -EFAULT; - - if (get_user(data, &sgio32->usr_ptr)) - return -EFAULT; - if (put_user(compat_ptr(data), &sgio->usr_ptr)) - return -EFAULT; - - err = do_ioctl(file, cmd, (unsigned long) sgio); - - if (err >= 0) { - void __user *datap; - - if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, - sizeof(int)) || - get_user(datap, &sgio->usr_ptr) || - put_user((u32)(unsigned long)datap, - &sgio32->usr_ptr) || - copy_in_user(&sgio32->status, &sgio->status, - (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned short)) + - (3 * sizeof(int)))) - err = -EFAULT; - } - - return err; -} - -struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ - char req_state; - char orphan; - char sg_io_owned; - char problem; - int pack_id; - compat_uptr_t usr_ptr; - unsigned int duration; - int unused; -}; - -static int sg_grt_trans(struct file *file, - unsigned int cmd, struct compat_sg_req_info __user *o) -{ - int err, i; - sg_req_info_t __user *r; - r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = do_ioctl(file, cmd, (unsigned long)r); - if (err < 0) - return err; - for (i = 0; i < SG_MAX_QUEUE; i++) { - void __user *ptr; - int d; - - if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || - get_user(ptr, &r[i].usr_ptr) || - get_user(d, &r[i].duration) || - put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || - put_user(d, &o[i].duration)) - return -EFAULT; - } - return err; -} -#endif /* CONFIG_BLOCK */ - -struct sock_fprog32 { - unsigned short len; - compat_caddr_t filter; -}; - -#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) -#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) - -static int ppp_sock_fprog_ioctl_trans(struct file *file, - unsigned int cmd, struct sock_fprog32 __user *u_fprog32) -{ - struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); - void __user *fptr64; - u32 fptr32; - u16 flen; - - if (get_user(flen, &u_fprog32->len) || - get_user(fptr32, &u_fprog32->filter)) - return -EFAULT; - - fptr64 = compat_ptr(fptr32); - - if (put_user(flen, &u_fprog64->len) || - put_user(fptr64, &u_fprog64->filter)) - return -EFAULT; - - if (cmd == PPPIOCSPASS32) - cmd = PPPIOCSPASS; - else - cmd = PPPIOCSACTIVE; - - return do_ioctl(file, cmd, (unsigned long) u_fprog64); -} - -struct ppp_option_data32 { - compat_caddr_t ptr; - u32 length; - compat_int_t transmit; -}; -#define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32) - -struct ppp_idle32 { - compat_time_t xmit_idle; - compat_time_t recv_idle; -}; -#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) - -static int ppp_gidle(struct file *file, unsigned int cmd, - struct ppp_idle32 __user *idle32) -{ - struct 
ppp_idle __user *idle; - __kernel_time_t xmit, recv; - int err; - - idle = compat_alloc_user_space(sizeof(*idle)); - - err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle); - - if (!err) { - if (get_user(xmit, &idle->xmit_idle) || - get_user(recv, &idle->recv_idle) || - put_user(xmit, &idle32->xmit_idle) || - put_user(recv, &idle32->recv_idle)) - err = -EFAULT; - } - return err; -} - -static int ppp_scompress(struct file *file, unsigned int cmd, - struct ppp_option_data32 __user *odata32) -{ - struct ppp_option_data __user *odata; - __u32 data; - void __user *datap; - - odata = compat_alloc_user_space(sizeof(*odata)); - - if (get_user(data, &odata32->ptr)) - return -EFAULT; - - datap = compat_ptr(data); - if (put_user(datap, &odata->ptr)) - return -EFAULT; - - if (copy_in_user(&odata->length, &odata32->length, - sizeof(__u32) + sizeof(int))) - return -EFAULT; - - return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata); -} - -#ifdef CONFIG_BLOCK -struct mtget32 { - compat_long_t mt_type; - compat_long_t mt_resid; - compat_long_t mt_dsreg; - compat_long_t mt_gstat; - compat_long_t mt_erreg; - compat_daddr_t mt_fileno; - compat_daddr_t mt_blkno; -}; -#define MTIOCGET32 _IOR('m', 2, struct mtget32) - -struct mtpos32 { - compat_long_t mt_blkno; -}; -#define MTIOCPOS32 _IOR('m', 3, struct mtpos32) - -static int mt_ioctl_trans(struct file *file, - unsigned int cmd, void __user *argp) -{ - /* NULL initialization to make gcc shut up */ - struct mtget __user *get = NULL; - struct mtget32 __user *umget32; - struct mtpos __user *pos = NULL; - struct mtpos32 __user *upos32; - unsigned long kcmd; - void *karg; - int err = 0; - - switch(cmd) { - case MTIOCPOS32: - kcmd = MTIOCPOS; - pos = compat_alloc_user_space(sizeof(*pos)); - karg = pos; - break; - default: /* MTIOCGET32 */ - kcmd = MTIOCGET; - get = compat_alloc_user_space(sizeof(*get)); - karg = get; - break; - } - if (karg == NULL) - return -EFAULT; - err = do_ioctl(file, kcmd, (unsigned long)karg); - if (err) - return err; - switch (cmd) { - case MTIOCPOS32: - upos32 = argp; - err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno); - break; - case MTIOCGET32: - umget32 = argp; - err = convert_in_user(&get->mt_type, &umget32->mt_type); - err |= convert_in_user(&get->mt_resid, &umget32->mt_resid); - err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg); - err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat); - err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg); - err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno); - err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno); - break; - } - return err ? -EFAULT: 0; -} - -#endif /* CONFIG_BLOCK */ - -/* Bluetooth ioctls */ -#define HCIUARTSETPROTO _IOW('U', 200, int) -#define HCIUARTGETPROTO _IOR('U', 201, int) -#define HCIUARTGETDEVICE _IOR('U', 202, int) -#define HCIUARTSETFLAGS _IOW('U', 203, int) -#define HCIUARTGETFLAGS _IOR('U', 204, int) - -#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) -#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) -#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) -#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) - -static int rtc_ioctl(struct file *file, - unsigned cmd, void __user *argp) -{ - unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); - int ret; - - if (valp == NULL) - return -EFAULT; - switch (cmd) { - case RTC_IRQP_READ32: - case RTC_EPOCH_READ32: - ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? 
- RTC_IRQP_READ : RTC_EPOCH_READ, - (unsigned long)valp); - if (ret) - return ret; - return convert_in_user(valp, (unsigned int __user *)argp); - case RTC_IRQP_SET32: - return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); - case RTC_EPOCH_SET32: - return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp); - } - - return -ENOIOCTLCMD; -} - -/* on ia32 l_start is on a 32-bit boundary */ -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) -struct space_resv_32 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file */ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -}; - -#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32) -#define FS_IOC_UNRESVSP_32 _IOW ('X', 41, struct space_resv_32) -#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32) -#define FS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct space_resv_32) -#define FS_IOC_ZERO_RANGE_32 _IOW ('X', 57, struct space_resv_32) - -/* just account for different alignment */ -static int compat_ioctl_preallocate(struct file *file, int mode, - struct space_resv_32 __user *p32) -{ - struct space_resv __user *p = compat_alloc_user_space(sizeof(*p)); - - if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || - copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || - copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || - copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || - copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || - copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || - copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) - return -EFAULT; - - return ioctl_preallocate(file, mode, p); -} -#endif - /* * simple reversible transform to make our table more evenly * distributed after sorting. @@ -512,33 +59,7 @@ static int compat_ioctl_preallocate(struct file *file, int mode, #define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) #define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), -/* ioctl should not be warned about even if it's not implemented. - Valid reasons to use this: - - It is implemented with ->compat_ioctl on some device, but programs - call it on others too. - - The ioctl is not implemented in the native kernel, but programs - call it commonly anyways. - Most other reasons are not valid. */ -#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd) - static unsigned int ioctl_pointer[] = { -/* compatible ioctls first */ -/* Little t */ -COMPATIBLE_IOCTL(TIOCOUTQ) -/* Little f */ -COMPATIBLE_IOCTL(FIOCLEX) -COMPATIBLE_IOCTL(FIONCLEX) -COMPATIBLE_IOCTL(FIOASYNC) -COMPATIBLE_IOCTL(FIONBIO) -COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ -COMPATIBLE_IOCTL(FS_IOC_FIEMAP) -/* 0x00 */ -COMPATIBLE_IOCTL(FIBMAP) -COMPATIBLE_IOCTL(FIGETBSZ) -/* 'X' - originally XFS but some now in the VFS */ -COMPATIBLE_IOCTL(FIFREEZE) -COMPATIBLE_IOCTL(FITHAW) -COMPATIBLE_IOCTL(FITRIM) #ifdef CONFIG_BLOCK /* Big S */ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) @@ -550,43 +71,10 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif -/* Big V (don't complain on serial console) */ -IGNORE_IOCTL(VT_OPENQRY) -IGNORE_IOCTL(VT_GETMODE) -/* Little p (/dev/rtc, /dev/envctrl, etc.) 
*/ -COMPATIBLE_IOCTL(RTC_AIE_ON) -COMPATIBLE_IOCTL(RTC_AIE_OFF) -COMPATIBLE_IOCTL(RTC_UIE_ON) -COMPATIBLE_IOCTL(RTC_UIE_OFF) -COMPATIBLE_IOCTL(RTC_PIE_ON) -COMPATIBLE_IOCTL(RTC_PIE_OFF) -COMPATIBLE_IOCTL(RTC_WIE_ON) -COMPATIBLE_IOCTL(RTC_WIE_OFF) -COMPATIBLE_IOCTL(RTC_ALM_SET) -COMPATIBLE_IOCTL(RTC_ALM_READ) -COMPATIBLE_IOCTL(RTC_RD_TIME) -COMPATIBLE_IOCTL(RTC_SET_TIME) -COMPATIBLE_IOCTL(RTC_WKALM_SET) -COMPATIBLE_IOCTL(RTC_WKALM_RD) -/* - * These two are only for the sbus rtc driver, but - * hwclock tries them on every rtc device first when - * running on sparc. On other architectures the entries - * are useless but harmless. - */ -COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ -COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ -/* Little m */ -COMPATIBLE_IOCTL(MTIOCTOP) -/* Socket level stuff */ -COMPATIBLE_IOCTL(FIOQSIZE) #ifdef CONFIG_BLOCK -/* md calls this on random blockdevs */ -IGNORE_IOCTL(RAID_VERSION) -/* qemu/qemu-img might call these two on plain files for probing */ -IGNORE_IOCTL(CDROM_DRIVE_STATUS) -IGNORE_IOCTL(FDGETPRM32) /* SG stuff */ +COMPATIBLE_IOCTL(SG_IO) +COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) COMPATIBLE_IOCTL(SG_SET_TIMEOUT) COMPATIBLE_IOCTL(SG_GET_TIMEOUT) COMPATIBLE_IOCTL(SG_EMULATED_HOST) @@ -610,314 +98,6 @@ COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) #endif -/* PPP stuff */ -COMPATIBLE_IOCTL(PPPIOCGFLAGS) -COMPATIBLE_IOCTL(PPPIOCSFLAGS) -COMPATIBLE_IOCTL(PPPIOCGASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCSASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCGUNIT) -COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCGMRU) -COMPATIBLE_IOCTL(PPPIOCSMRU) -COMPATIBLE_IOCTL(PPPIOCSMAXCID) -COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP) -COMPATIBLE_IOCTL(PPPIOCXFERUNIT) -/* PPPIOCSCOMPRESS is translated */ -COMPATIBLE_IOCTL(PPPIOCGNPMODE) -COMPATIBLE_IOCTL(PPPIOCSNPMODE) -COMPATIBLE_IOCTL(PPPIOCGDEBUG) -COMPATIBLE_IOCTL(PPPIOCSDEBUG) -/* PPPIOCSPASS is translated */ -/* PPPIOCSACTIVE is translated */ -/* PPPIOCGIDLE is translated */ -COMPATIBLE_IOCTL(PPPIOCNEWUNIT) -COMPATIBLE_IOCTL(PPPIOCATTACH) -COMPATIBLE_IOCTL(PPPIOCDETACH) -COMPATIBLE_IOCTL(PPPIOCSMRRU) -COMPATIBLE_IOCTL(PPPIOCCONNECT) -COMPATIBLE_IOCTL(PPPIOCDISCONN) -COMPATIBLE_IOCTL(PPPIOCATTCHAN) -COMPATIBLE_IOCTL(PPPIOCGCHAN) -COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) -/* Big A */ -/* sparc only */ -/* Big Q for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET) -COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO) -COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE) -COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT) -COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT) -COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE) -COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR) -COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI) -COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES) -COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS) -COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS) -COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO) -COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL) -COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE) -COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC) -COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND) -COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL) -COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE) -/* Big T for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE) -COMPATIBLE_IOCTL(SNDCTL_TMR_START) -COMPATIBLE_IOCTL(SNDCTL_TMR_STOP) -COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE) -COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO) 
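/*
 * Note on the surrounding table: COMPATIBLE_IOCTL() does not store the raw
 * command number but XFORM(cmd), the reversible bit-mix defined above, so
 * that the sorted ioctl_pointer[] array is evenly distributed and the
 * binary search in compat_ioctl_check_table() stays balanced. A minimal
 * sketch of the lookup side, assuming a sorted u32 table[] and its length
 * are in scope (the helper name is illustrative, not the kernel's):
 */
static bool ioctl_in_table(const u32 *table, int count, unsigned int cmd)
{
	u32 xcmd = XFORM((u32)cmd);	/* same transform applied to entries */
	int lo = 0, hi = count - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (table[mid] == xcmd)
			return true;	/* cmd may be passed through as-is */
		if (table[mid] < xcmd)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return false;
}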
-COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE) -COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME) -COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT) -/* Little m for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME) -COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE) -COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD) -/* Big P for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_DSP_RESET) -COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC) -COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED) -COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE) -COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS) -COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER) -COMPATIBLE_IOCTL(SNDCTL_DSP_POST) -COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE) -COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR) -/* SNDCTL_DSP_MAPINBUF, XXX needs translation */ -/* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */ -COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO) -COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX) -COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY) -COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE) -COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE) -COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS) -COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS) -COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER) -/* Big C for sound/OSS */ -COMPATIBLE_IOCTL(SNDCTL_COPR_RESET) -COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD) -COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA) -COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE) -COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA) -COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE) -COMPATIBLE_IOCTL(SNDCTL_COPR_RUN) -COMPATIBLE_IOCTL(SNDCTL_COPR_HALT) -COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG) -COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG) -/* Big M for sound/OSS */ -COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO)) -COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR)) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE) -/* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */ -/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */ -COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS) -COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE) 
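/*
 * Aside on the two spellings in this table: SOUND_MIXER_READ_FOO and
 * MIXER_READ(SOUND_MIXER_FOO) expand to the same ioctl number in
 * <linux/soundcard.h>; the named SOUND_MIXER_READ_* and SOUND_MIXER_WRITE_*
 * aliases exist only for the classic OSS channels, so the later channels
 * (DIGITAL1..3, PHONEIN, PHONEOUT, VIDEO, RADIO, MONITOR) have to be
 * spelled via MIXER_READ() and MIXER_WRITE() directly.
 */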
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO)) -COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR)) -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE) -/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */ -/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */ -COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC) -COMPATIBLE_IOCTL(SOUND_MIXER_INFO) -COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO) -COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS) -COMPATIBLE_IOCTL(SOUND_MIXER_AGC) -COMPATIBLE_IOCTL(SOUND_MIXER_3DSE) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4) -COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5) -COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS) -COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS) -COMPATIBLE_IOCTL(OSS_GETVERSION) -/* Raw devices */ -COMPATIBLE_IOCTL(RAW_SETBIND) -COMPATIBLE_IOCTL(RAW_GETBIND) -/* Watchdog */ -COMPATIBLE_IOCTL(WDIOC_GETSUPPORT) -COMPATIBLE_IOCTL(WDIOC_GETSTATUS) -COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS) -COMPATIBLE_IOCTL(WDIOC_GETTEMP) -COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) -COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) -COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) -COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) -COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT) -COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT) -/* Big R */ -COMPATIBLE_IOCTL(RNDGETENTCNT) -COMPATIBLE_IOCTL(RNDADDTOENTCNT) -COMPATIBLE_IOCTL(RNDGETPOOL) -COMPATIBLE_IOCTL(RNDADDENTROPY) -COMPATIBLE_IOCTL(RNDZAPENTCNT) -COMPATIBLE_IOCTL(RNDCLEARPOOL) -/* Bluetooth */ -COMPATIBLE_IOCTL(HCIDEVUP) -COMPATIBLE_IOCTL(HCIDEVDOWN) -COMPATIBLE_IOCTL(HCIDEVRESET) -COMPATIBLE_IOCTL(HCIDEVRESTAT) -COMPATIBLE_IOCTL(HCIGETDEVLIST) -COMPATIBLE_IOCTL(HCIGETDEVINFO) -COMPATIBLE_IOCTL(HCIGETCONNLIST) -COMPATIBLE_IOCTL(HCIGETCONNINFO) -COMPATIBLE_IOCTL(HCIGETAUTHINFO) -COMPATIBLE_IOCTL(HCISETRAW) -COMPATIBLE_IOCTL(HCISETSCAN) -COMPATIBLE_IOCTL(HCISETAUTH) -COMPATIBLE_IOCTL(HCISETENCRYPT) -COMPATIBLE_IOCTL(HCISETPTYPE) -COMPATIBLE_IOCTL(HCISETLINKPOL) -COMPATIBLE_IOCTL(HCISETLINKMODE) -COMPATIBLE_IOCTL(HCISETACLMTU) -COMPATIBLE_IOCTL(HCISETSCOMTU) -COMPATIBLE_IOCTL(HCIBLOCKADDR) -COMPATIBLE_IOCTL(HCIUNBLOCKADDR) -COMPATIBLE_IOCTL(HCIINQUIRY) -COMPATIBLE_IOCTL(HCIUARTSETPROTO) -COMPATIBLE_IOCTL(HCIUARTGETPROTO) -COMPATIBLE_IOCTL(HCIUARTGETDEVICE) -COMPATIBLE_IOCTL(HCIUARTSETFLAGS) -COMPATIBLE_IOCTL(HCIUARTGETFLAGS) -COMPATIBLE_IOCTL(RFCOMMCREATEDEV) -COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) -COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) -COMPATIBLE_IOCTL(RFCOMMGETDEVINFO) -COMPATIBLE_IOCTL(RFCOMMSTEALDLC) -/* CAPI */ -COMPATIBLE_IOCTL(CAPI_REGISTER) -COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER) -COMPATIBLE_IOCTL(CAPI_GET_VERSION) 
-COMPATIBLE_IOCTL(CAPI_GET_SERIAL) -COMPATIBLE_IOCTL(CAPI_GET_PROFILE) -COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD) -COMPATIBLE_IOCTL(CAPI_GET_ERRCODE) -COMPATIBLE_IOCTL(CAPI_INSTALLED) -COMPATIBLE_IOCTL(CAPI_GET_FLAGS) -COMPATIBLE_IOCTL(CAPI_SET_FLAGS) -COMPATIBLE_IOCTL(CAPI_CLR_FLAGS) -COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT) -COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT) -/* Misc. */ -COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */ -COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */ -COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) -COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) -COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) -COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* hiddev */ -COMPATIBLE_IOCTL(HIDIOCGVERSION) -COMPATIBLE_IOCTL(HIDIOCAPPLICATION) -COMPATIBLE_IOCTL(HIDIOCGDEVINFO) -COMPATIBLE_IOCTL(HIDIOCGSTRING) -COMPATIBLE_IOCTL(HIDIOCINITREPORT) -COMPATIBLE_IOCTL(HIDIOCGREPORT) -COMPATIBLE_IOCTL(HIDIOCSREPORT) -COMPATIBLE_IOCTL(HIDIOCGREPORTINFO) -COMPATIBLE_IOCTL(HIDIOCGFIELDINFO) -COMPATIBLE_IOCTL(HIDIOCGUSAGE) -COMPATIBLE_IOCTL(HIDIOCSUSAGE) -COMPATIBLE_IOCTL(HIDIOCGUCODE) -COMPATIBLE_IOCTL(HIDIOCGFLAG) -COMPATIBLE_IOCTL(HIDIOCSFLAG) -COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) -COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) -/* joystick */ -COMPATIBLE_IOCTL(JSIOCGVERSION) -COMPATIBLE_IOCTL(JSIOCGAXES) -COMPATIBLE_IOCTL(JSIOCGBUTTONS) -COMPATIBLE_IOCTL(JSIOCGNAME(0)) - -/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl, - but we don't want warnings on other file systems. So declare - them as compatible here. */ -#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) -#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) - -IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) -IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) - -#ifdef CONFIG_SPARC -/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ -IGNORE_IOCTL(FBIOGTYPE) -IGNORE_IOCTL(FBIOSATTR) -IGNORE_IOCTL(FBIOGATTR) -IGNORE_IOCTL(FBIOSVIDEO) -IGNORE_IOCTL(FBIOGVIDEO) -IGNORE_IOCTL(FBIOSCURPOS) -IGNORE_IOCTL(FBIOGCURPOS) -IGNORE_IOCTL(FBIOGCURMAX) -IGNORE_IOCTL(FBIOPUTCMAP32) -IGNORE_IOCTL(FBIOGETCMAP32) -IGNORE_IOCTL(FBIOSCURSOR32) -IGNORE_IOCTL(FBIOGCURSOR32) -#endif }; /* @@ -930,51 +110,12 @@ IGNORE_IOCTL(FBIOGCURSOR32) static long do_ioctl_trans(unsigned int cmd, unsigned long arg, struct file *file) { - void __user *argp = compat_ptr(arg); - - switch (cmd) { - case PPPIOCGIDLE32: - return ppp_gidle(file, cmd, argp); - case PPPIOCSCOMPRESS32: - return ppp_scompress(file, cmd, argp); - case PPPIOCSPASS32: - case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(file, cmd, argp); -#ifdef CONFIG_BLOCK - case SG_IO: - return sg_ioctl_trans(file, cmd, argp); - case SG_GET_REQUEST_TABLE: - return sg_grt_trans(file, cmd, argp); - case MTIOCGET32: - case MTIOCPOS32: - return mt_ioctl_trans(file, cmd, argp); -#endif - /* Not implemented in the native kernel */ - case RTC_IRQP_READ32: - case RTC_IRQP_SET32: - case RTC_EPOCH_READ32: - case RTC_EPOCH_SET32: - return rtc_ioctl(file, cmd, argp); - } - - /* - * These take an integer instead of a pointer as 'arg', - * so we must not do a compat_ptr() translation. 
- */ - switch (cmd) { - /* RAID */ - case HOT_REMOVE_DISK: - case HOT_ADD_DISK: - case SET_DISK_FAULTY: - case SET_BITMAP_FILE: - return vfs_ioctl(file, cmd, arg); - } - return -ENOIOCTLCMD; } static int compat_ioctl_check_table(unsigned int xcmd) { +#ifdef CONFIG_BLOCK int i; const int max = ARRAY_SIZE(ioctl_pointer) - 1; @@ -993,6 +134,9 @@ static int compat_ioctl_check_table(unsigned int xcmd) i--; return ioctl_pointer[i] == xcmd; +#else + return 0; +#endif } COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, @@ -1009,20 +153,40 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, if (error) goto out_fput; - /* - * To allow the compat_ioctl handlers to be self contained - * we need to check the common ioctls here first. - * Just handle them with the standard handlers below. - */ switch (cmd) { + /* these are never seen by ->ioctl(), no argument or int argument */ case FIOCLEX: case FIONCLEX: + case FIFREEZE: + case FITHAW: + case FICLONE: + goto do_ioctl; + /* these are never seen by ->ioctl(), pointer argument */ case FIONBIO: case FIOASYNC: case FIOQSIZE: - break; - -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) + case FS_IOC_FIEMAP: + case FIGETBSZ: + case FICLONERANGE: + case FIDEDUPERANGE: + goto found_handler; + /* + * The next group is the stuff handled inside file_ioctl(). + * For regular files these never reach ->ioctl(); for + * devices, sockets, etc. they do and one (FIONREAD) is + * even accepted in some cases. In all those cases + * argument has the same type, so we can handle these + * here, shunting them towards do_vfs_ioctl(). + * ->compat_ioctl() will never see any of those. + */ + /* pointer argument, never actually handled by ->ioctl() */ + case FIBMAP: + goto found_handler; + /* handled by some ->ioctl(); always a pointer to int */ + case FIONREAD: + goto found_handler; + /* these get messy on amd64 due to alignment differences */ +#if defined(CONFIG_X86_64) case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); @@ -1039,32 +203,12 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, #else case FS_IOC_RESVSP: case FS_IOC_RESVSP64: - error = ioctl_preallocate(f.file, 0, compat_ptr(arg)); - goto out_fput; case FS_IOC_UNRESVSP: case FS_IOC_UNRESVSP64: - error = ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, - compat_ptr(arg)); - goto out_fput; case FS_IOC_ZERO_RANGE: - error = ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, - compat_ptr(arg)); - goto out_fput; + goto found_handler; #endif - case FICLONE: - case FICLONERANGE: - case FIDEDUPERANGE: - case FS_IOC_FIEMAP: - goto do_ioctl; - - case FIBMAP: - case FIGETBSZ: - case FIONREAD: - if (S_ISREG(file_inode(f.file)->i_mode)) - break; - /*FALL THROUGH*/ - default: if (f.file->f_op->compat_ioctl) { error = f.file->f_op->compat_ioctl(f.file, cmd, arg); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index dc5dbf6a81d7..cb61467478ca 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -101,7 +101,7 @@ static int create_link(struct config_item *parent_item, } target_sd->s_links++; spin_unlock(&configfs_dirent_lock); - ret = configfs_get_target_path(item, item, body); + ret = configfs_get_target_path(parent_item, item, body); if (!ret) ret = configfs_create_link(target_sd, parent_item->ci_dentry, dentry, body); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index d12ea28836a5..2f04024c3588 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -958,8 +958,8 @@ static int 
cramfs_get_tree(struct fs_context *fc) if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { ret = get_tree_mtd(fc, cramfs_mtd_fill_super); - if (ret < 0) - return ret; + if (!ret) + return 0; } if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) ret = get_tree_bdev(fc, cramfs_blkdev_fill_super); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 82da2510721f..1f4b8a277060 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -26,7 +26,7 @@ #include <linux/namei.h> #include "fscrypt_private.h" -static void __fscrypt_decrypt_bio(struct bio *bio, bool done) +void fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; struct bvec_iter_all iter_all; @@ -37,37 +37,10 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done) bv->bv_offset); if (ret) SetPageError(page); - else if (done) - SetPageUptodate(page); - if (done) - unlock_page(page); } } - -void fscrypt_decrypt_bio(struct bio *bio) -{ - __fscrypt_decrypt_bio(bio, false); -} EXPORT_SYMBOL(fscrypt_decrypt_bio); -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = container_of(work, struct fscrypt_ctx, work); - struct bio *bio = ctx->bio; - - __fscrypt_decrypt_bio(bio, true); - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->work, completion_pages); - ctx->bio = bio; - fscrypt_enqueue_decrypt_work(&ctx->work); -} -EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio); - int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 32a7ad0098cc..3719efa546c6 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,29 +27,20 @@ #include <linux/ratelimit.h> #include <linux/dcache.h> #include <linux/namei.h> -#include <crypto/aes.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; module_param(num_prealloc_crypto_pages, uint, 0444); MODULE_PARM_DESC(num_prealloc_crypto_pages, "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); static mempool_t *fscrypt_bounce_page_pool = NULL; -static LIST_HEAD(fscrypt_free_ctxs); -static DEFINE_SPINLOCK(fscrypt_ctx_lock); - static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); -static struct kmem_cache *fscrypt_ctx_cachep; struct kmem_cache *fscrypt_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) @@ -58,62 +49,6 @@ void fscrypt_enqueue_decrypt_work(struct work_struct *work) } EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work); -/** - * fscrypt_release_ctx() - Release a decryption context - * @ctx: The decryption context to release. - * - * If the decryption context was allocated from the pre-allocated pool, return - * it to that pool. Else, free it. - */ -void fscrypt_release_ctx(struct fscrypt_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(fscrypt_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&fscrypt_ctx_lock, flags); - list_add(&ctx->free_list, &fscrypt_free_ctxs); - spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); - } -} -EXPORT_SYMBOL(fscrypt_release_ctx); - -/** - * fscrypt_get_ctx() - Get a decryption context - * @gfp_flags: The gfp flag for memory allocation - * - * Allocate and initialize a decryption context. 
- * - * Return: A new decryption context on success; an ERR_PTR() otherwise. - */ -struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags) -{ - struct fscrypt_ctx *ctx; - unsigned long flags; - - /* - * First try getting a ctx from the free list so that we don't have to - * call into the slab allocator. - */ - spin_lock_irqsave(&fscrypt_ctx_lock, flags); - ctx = list_first_entry_or_null(&fscrypt_free_ctxs, - struct fscrypt_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); - if (!ctx) - return ERR_PTR(-ENOMEM); - ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - return ctx; -} -EXPORT_SYMBOL(fscrypt_get_ctx); - struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) { return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); @@ -138,14 +73,17 @@ EXPORT_SYMBOL(fscrypt_free_bounce_page); void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci) { + u8 flags = fscrypt_policy_flags(&ci->ci_policy); + memset(iv, 0, ci->ci_mode->ivsize); - iv->lblk_num = cpu_to_le64(lblk_num); - if (fscrypt_is_direct_key_policy(&ci->ci_policy)) + if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { + WARN_ON_ONCE((u32)lblk_num != lblk_num); + lblk_num |= (u64)ci->ci_inode->i_ino << 32; + } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); - - if (ci->ci_essiv_tfm != NULL) - crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw); + } + iv->lblk_num = cpu_to_le64(lblk_num); } /* Encrypt or decrypt a single filesystem block of file contents */ @@ -396,17 +334,6 @@ const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, }; -static void fscrypt_destroy(void) -{ - struct fscrypt_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list) - kmem_cache_free(fscrypt_ctx_cachep, pos); - INIT_LIST_HEAD(&fscrypt_free_ctxs); - mempool_destroy(fscrypt_bounce_page_pool); - fscrypt_bounce_page_pool = NULL; -} - /** * fscrypt_initialize() - allocate major buffers for fs encryption. * @cop_flags: fscrypt operations flags @@ -414,11 +341,11 @@ static void fscrypt_destroy(void) * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * - * Return: Zero on success, non-zero otherwise. + * Return: 0 on success; -errno on failure */ int fscrypt_initialize(unsigned int cop_flags) { - int i, res = -ENOMEM; + int err = 0; /* No need to allocate a bounce page pool if this FS won't use it. 
*/ if (cop_flags & FS_CFLG_OWN_PAGES) @@ -426,29 +353,18 @@ int fscrypt_initialize(unsigned int cop_flags) mutex_lock(&fscrypt_init_mutex); if (fscrypt_bounce_page_pool) - goto already_initialized; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct fscrypt_ctx *ctx; - - ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); - if (!ctx) - goto fail; - list_add(&ctx->free_list, &fscrypt_free_ctxs); - } + goto out_unlock; + err = -ENOMEM; fscrypt_bounce_page_pool = mempool_create_page_pool(num_prealloc_crypto_pages, 0); if (!fscrypt_bounce_page_pool) - goto fail; + goto out_unlock; -already_initialized: - mutex_unlock(&fscrypt_init_mutex); - return 0; -fail: - fscrypt_destroy(); + err = 0; +out_unlock: mutex_unlock(&fscrypt_init_mutex); - return res; + return err; } void fscrypt_msg(const struct inode *inode, const char *level, @@ -494,13 +410,9 @@ static int __init fscrypt_init(void) if (!fscrypt_read_workqueue) goto fail; - fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT); - if (!fscrypt_ctx_cachep) - goto fail_free_queue; - fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); if (!fscrypt_info_cachep) - goto fail_free_ctx; + goto fail_free_queue; err = fscrypt_init_keyring(); if (err) @@ -510,8 +422,6 @@ static int __init fscrypt_init(void) fail_free_info: kmem_cache_destroy(fscrypt_info_cachep); -fail_free_ctx: - kmem_cache_destroy(fscrypt_ctx_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e84efc01512e..130b50e5a011 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -163,11 +163,8 @@ struct fscrypt_info { /* The actual crypto transform used for encryption and decryption */ struct crypto_skcipher *ci_ctfm; - /* - * Cipher for ESSIV IV generation. Only set for CBC contents - * encryption, otherwise is NULL. - */ - struct crypto_cipher *ci_essiv_tfm; + /* True if the key should be freed when this fscrypt_info is freed */ + bool ci_owns_key; /* * Encryption mode used for this inode. It corresponds to either the @@ -209,8 +206,6 @@ typedef enum { FS_ENCRYPT, } fscrypt_direction_t; -#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 - static inline bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { @@ -289,7 +284,8 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, */ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 #define HKDF_CONTEXT_PER_FILE_KEY 2 -#define HKDF_CONTEXT_PER_MODE_KEY 3 +#define HKDF_CONTEXT_DIRECT_KEY 3 +#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, @@ -386,8 +382,14 @@ struct fscrypt_master_key { struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; - /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */ - struct crypto_skcipher *mk_mode_keys[__FSCRYPT_MODE_MAX + 1]; + /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */ + struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1]; + + /* + * Crypto API transforms for filesystem-layer implementation of + * IV_INO_LBLK_64 policies, allocated on-demand. 
+ */ + struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1]; } __randomize_layout; @@ -443,8 +445,7 @@ struct fscrypt_mode { const char *cipher_str; int keysize; int ivsize; - bool logged_impl_name; - bool needs_essiv; + int logged_impl_name; }; static inline bool diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index c34fa7c61b43..040df1f5e1c8 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -43,8 +43,10 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); - for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++) - crypto_free_skcipher(mk->mk_mode_keys[i]); + for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { + crypto_free_skcipher(mk->mk_direct_tfms[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]); + } key_put(mk->mk_users); kzfree(mk); diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index d71c2d6dd162..f577bb6613f9 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -8,15 +8,11 @@ * Heavily modified since then. */ -#include <crypto/aes.h> -#include <crypto/sha.h> #include <crypto/skcipher.h> #include <linux/key.h> #include "fscrypt_private.h" -static struct crypto_shash *essiv_hash_tfm; - static struct fscrypt_mode available_modes[] = { [FSCRYPT_MODE_AES_256_XTS] = { .friendly_name = "AES-256-XTS", @@ -31,11 +27,10 @@ static struct fscrypt_mode available_modes[] = { .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { - .friendly_name = "AES-128-CBC", - .cipher_str = "cbc(aes)", + .friendly_name = "AES-128-CBC-ESSIV", + .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, - .needs_essiv = true, }, [FSCRYPT_MODE_AES_128_CTS] = { .friendly_name = "AES-128-CTS-CBC", @@ -86,15 +81,13 @@ struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, mode->cipher_str, PTR_ERR(tfm)); return tfm; } - if (unlikely(!mode->logged_impl_name)) { + if (!xchg(&mode->logged_impl_name, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug * performance problems by logging the ->cra_driver_name the - * first time a mode is used. Note that multiple threads can - * race here, but it doesn't really matter. + * first time a mode is used. 
*/ - mode->logged_impl_name = true; pr_info("fscrypt: %s using implementation \"%s\"\n", mode->friendly_name, crypto_skcipher_alg(tfm)->base.cra_driver_name); @@ -111,131 +104,64 @@ err_free_tfm: return ERR_PTR(err); } -static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) -{ - struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); - - /* init hash transform on demand */ - if (unlikely(!tfm)) { - struct crypto_shash *prev_tfm; - - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - if (PTR_ERR(tfm) == -ENOENT) { - fscrypt_warn(NULL, - "Missing crypto API support for SHA-256"); - return -ENOPKG; - } - fscrypt_err(NULL, - "Error allocating SHA-256 transform: %ld", - PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); - if (prev_tfm) { - crypto_free_shash(tfm); - tfm = prev_tfm; - } - } - - { - SHASH_DESC_ON_STACK(desc, tfm); - desc->tfm = tfm; - - return crypto_shash_digest(desc, key, keysize, salt); - } -} - -static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, - int keysize) -{ - int err; - struct crypto_cipher *essiv_tfm; - u8 salt[SHA256_DIGEST_SIZE]; - - if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE)) - return -EINVAL; - - essiv_tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(essiv_tfm)) - return PTR_ERR(essiv_tfm); - - ci->ci_essiv_tfm = essiv_tfm; - - err = derive_essiv_salt(raw_key, keysize, salt); - if (err) - goto out; - - /* - * Using SHA256 to derive the salt/key will result in AES-256 being - * used for IV generation. File contents encryption will still use the - * configured keysize (AES-128) nevertheless. - */ - err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); - if (err) - goto out; - -out: - memzero_explicit(salt, sizeof(salt)); - return err; -} - -/* Given the per-file key, set up the file's crypto transform object(s) */ +/* Given the per-file key, set up the file's crypto transform object */ int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key) { - struct fscrypt_mode *mode = ci->ci_mode; - struct crypto_skcipher *ctfm; - int err; - - ctfm = fscrypt_allocate_skcipher(mode, derived_key, ci->ci_inode); - if (IS_ERR(ctfm)) - return PTR_ERR(ctfm); + struct crypto_skcipher *tfm; - ci->ci_ctfm = ctfm; + tfm = fscrypt_allocate_skcipher(ci->ci_mode, derived_key, ci->ci_inode); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); - if (mode->needs_essiv) { - err = init_essiv_generator(ci, derived_key, mode->keysize); - if (err) { - fscrypt_warn(ci->ci_inode, - "Error initializing ESSIV generator: %d", - err); - return err; - } - } + ci->ci_ctfm = tfm; + ci->ci_owns_key = true; return 0; } static int setup_per_mode_key(struct fscrypt_info *ci, - struct fscrypt_master_key *mk) + struct fscrypt_master_key *mk, + struct crypto_skcipher **tfms, + u8 hkdf_context, bool include_fs_uuid) { + const struct inode *inode = ci->ci_inode; + const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; u8 mode_num = mode - available_modes; struct crypto_skcipher *tfm, *prev_tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; + u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; + unsigned int hkdf_infolen = 0; int err; - if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys))) + if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) return -EINVAL; /* pairs with cmpxchg() below */ - tfm = READ_ONCE(mk->mk_mode_keys[mode_num]); + tfm = READ_ONCE(tfms[mode_num]); if (likely(tfm != NULL)) goto done; BUILD_BUG_ON(sizeof(mode_num) != 1); + BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); + 
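	/*
	 * Layout of the HKDF application-specific info assembled here:
	 * a single mode_num byte for DIRECT_KEY keys, or mode_num followed
	 * by the 16-byte filesystem UUID for IV_INO_LBLK_64 keys; that is
	 * what the surrounding sizeof assertions pin down (1 + 16 = 17).
	 */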
BUILD_BUG_ON(sizeof(hkdf_info) != 17); + hkdf_info[hkdf_infolen++] = mode_num; + if (include_fs_uuid) { + memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid, + sizeof(sb->s_uuid)); + hkdf_infolen += sizeof(sb->s_uuid); + } err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_PER_MODE_KEY, - &mode_num, sizeof(mode_num), + hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) return err; - tfm = fscrypt_allocate_skcipher(mode, mode_key, ci->ci_inode); + tfm = fscrypt_allocate_skcipher(mode, mode_key, inode); memzero_explicit(mode_key, mode->keysize); if (IS_ERR(tfm)) return PTR_ERR(tfm); /* pairs with READ_ONCE() above */ - prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm); + prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm); if (prev_tfm != NULL) { crypto_free_skcipher(tfm); tfm = prev_tfm; @@ -266,7 +192,19 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, ci->ci_mode->friendly_name); return -EINVAL; } - return setup_per_mode_key(ci, mk); + return setup_per_mode_key(ci, mk, mk->mk_direct_tfms, + HKDF_CONTEXT_DIRECT_KEY, false); + } else if (ci->ci_policy.v2.flags & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { + /* + * IV_INO_LBLK_64: encryption keys are derived from (master_key, + * mode_num, filesystem_uuid), and inode number is included in + * the IVs. This format is optimized for use with inline + * encryption hardware compliant with the UFS or eMMC standards. + */ + return setup_per_mode_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + HKDF_CONTEXT_IV_INO_LBLK_64_KEY, + true); } err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, @@ -388,13 +326,10 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - if (ci->ci_direct_key) { + if (ci->ci_direct_key) fscrypt_put_direct_key(ci->ci_direct_key); - } else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) && - !fscrypt_is_direct_key_policy(&ci->ci_policy)) { + else if (ci->ci_owns_key) crypto_free_skcipher(ci->ci_ctfm); - crypto_free_cipher(ci->ci_essiv_tfm); - } key = ci->ci_master_key; if (key) { @@ -415,6 +350,7 @@ static void put_crypt_info(struct fscrypt_info *ci) key_invalidate(key); key_put(key); } + memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); } diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index ad1a36c370c3..5298ef22aa85 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -270,10 +270,6 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci, return -EINVAL; } - /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */ - if (WARN_ON(mode->needs_essiv)) - return -EINVAL; - dk = fscrypt_get_direct_key(ci, raw_master_key); if (IS_ERR(dk)) return PTR_ERR(dk); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4072ba644595..96f528071bed 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -29,6 +29,40 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +static bool supported_iv_ino_lblk_64_policy( + const struct fscrypt_policy_v2 *policy, + const struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int ino_bits = 64, lblk_bits = 64; + + if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { + fscrypt_warn(inode, + "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive"); + return false; + } + /* + * It's unsafe to include inode numbers in the IVs if the filesystem can + * potentially renumber inodes, e.g. via filesystem shrinking. 
+ */ + if (!sb->s_cop->has_stable_inodes || + !sb->s_cop->has_stable_inodes(sb)) { + fscrypt_warn(inode, + "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers", + sb->s_id); + return false; + } + if (sb->s_cop->get_ino_and_lblk_bits) + sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); + if (ino_bits > 32 || lblk_bits > 32) { + fscrypt_warn(inode, + "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers", + sb->s_id); + return false; + } + return true; +} + /** * fscrypt_supported_policy - check whether an encryption policy is supported * @@ -55,7 +89,8 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, return false; } - if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | + FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); @@ -83,6 +118,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, return false; } + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && + !supported_iv_ino_lblk_64_policy(policy, inode)) + return false; + if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { fscrypt_warn(inode, diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c @@ -220,10 +220,11 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) for (;;) { entry = xas_find_conflict(xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) + return entry; if (dax_entry_order(entry) < order) return XA_RETRY_ENTRY; - if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) || - !dax_is_locked(entry)) + if (!dax_is_locked(entry)) return entry; wq = dax_entry_waitqueue(xas, entry, &ewait.key); diff --git a/fs/dcache.c b/fs/dcache.c index e88cf0554e65..a2749a700230 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -319,7 +319,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; - WRITE_ONCE(dentry->d_flags, flags); + smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) @@ -903,17 +903,19 @@ struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; + unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); + seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { - if (likely(ret == READ_ONCE(dentry->d_parent))) + if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } @@ -1319,7 +1321,7 @@ resume: if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 87846aad594b..dede25247b81 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -420,20 +420,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds.
This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_u8(const char *name, umode_t mode, - struct dentry *parent, u8 *value) +void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, + u8 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -465,20 +456,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_u16(const char *name, umode_t mode, - struct dentry *parent, u16 *value) +void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, + u16 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -556,20 +538,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_u64(const char *name, umode_t mode, - struct dentry *parent, u64 *value) +void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, + u64 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); @@ -660,10 +633,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); * @value: a pointer to the variable that the file should read to and write * from. 
*/ -struct dentry *debugfs_create_x8(const char *name, umode_t mode, - struct dentry *parent, u8 *value) +void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, + u8 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -678,10 +651,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x16(const char *name, umode_t mode, - struct dentry *parent, u16 *value) +void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, + u16 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -696,10 +669,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x32(const char *name, umode_t mode, - struct dentry *parent, u32 *value) +void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, + u32 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -714,10 +687,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_x64(const char *name, umode_t mode, - struct dentry *parent, u64 *value) +void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, + u64 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); @@ -748,12 +721,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); * @value: a pointer to the variable that the file should read to and write * from. */ -struct dentry *debugfs_create_size_t(const char *name, umode_t mode, - struct dentry *parent, size_t *value) +void debugfs_create_size_t(const char *name, umode_t mode, + struct dentry *parent, size_t *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, - &fops_size_t, &fops_size_t_ro, - &fops_size_t_wo); + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t, + &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); @@ -785,12 +757,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, * @value: a pointer to the variable that the file should read to and write * from. 
*/ -struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, - struct dentry *parent, atomic_t *value) +void debugfs_create_atomic_t(const char *name, umode_t mode, + struct dentry *parent, atomic_t *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, - &fops_atomic_t, &fops_atomic_t_ro, - &fops_atomic_t_wo); + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t, + &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7b975dbb2bb4..f4d8df5e4714 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -299,13 +299,9 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - dentry = lookup_one_len_unlocked(name, parent, strlen(name)); + dentry = lookup_positive_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; - if (!d_really_is_positive(dentry)) { - dput(dentry); - return NULL; - } return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); diff --git a/fs/direct-io.c b/fs/direct-io.c index ae196784f487..0ec4f270139f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -221,29 +221,7 @@ static inline struct page *dio_get_page(struct dio *dio, } /* - * Warn about a page cache invalidation failure during a direct io write. - */ -void dio_warn_stale_pagecache(struct file *filp) -{ - static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); - char pathname[128]; - struct inode *inode = file_inode(filp); - char *path; - - errseq_set(&inode->i_mapping->wb_err, -EIO); - if (__ratelimit(&_rs)) { - path = file_path(filp, pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); - pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, - current->comm); - } -} - -/** * dio_complete() - called when all DIO BIO I/O has been completed - * @offset: the byte offset in the file of the completed operation * * This drops i_dio_count, lets interested parties know that a DIO operation * has completed, and calculates the resulting return code for the operation. 
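The debugfs hunks above convert the simple value helpers (debugfs_create_u8/u16/u64, the x* hex variants, size_t and atomic_t) from returning a struct dentry * to returning void, so callers can no longer check for failure and are expected not to care. A minimal sketch of what a caller looks like after this change; the module, directory and variable names are made up for illustration:

#include <linux/debugfs.h>
#include <linux/module.h>

static u8 demo_level;
static atomic_t demo_hits = ATOMIC_INIT(0);
static struct dentry *demo_dir;

static int __init demo_init(void)
{
	/* directory creation still returns a dentry, kept for removal */
	demo_dir = debugfs_create_dir("demo", NULL);

	/* these now return void; no error checking is possible or needed */
	debugfs_create_u8("level", 0644, demo_dir, &demo_level);
	debugfs_create_atomic_t("hits", 0444, demo_dir, &demo_hits);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove_recursive(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");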
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feecb57defa7..5fb45d865ce5 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -378,6 +378,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return rc; switch (cmd) { + case FITRIM: case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: case FS_IOC32_GETVERSION: diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 18426f4855f1..e23752d9a79f 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -128,13 +128,20 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *inode) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); struct dentry *lower_dir_dentry; + struct inode *lower_dir_inode; int rc; - dget(lower_dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + lower_dir_inode = d_inode(lower_dir_dentry); + inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + dget(lower_dentry); // don't even try to make the lower negative + if (lower_dentry->d_parent != lower_dir_dentry) + rc = -EINVAL; + else if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -142,10 +149,11 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, fsstack_copy_attr_times(dir, lower_dir_inode); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode->i_ctime = dir->i_ctime; - d_drop(dentry); out_unlock: - unlock_dir(lower_dir_dentry); dput(lower_dentry); + inode_unlock(lower_dir_inode); + if (!rc) + d_drop(dentry); return rc; } @@ -311,9 +319,9 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - struct inode *inode, *lower_inode = d_inode(lower_dentry); + struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + struct inode *inode, *lower_inode; struct ecryptfs_dentry_info *dentry_info; - struct vfsmount *lower_mnt; int rc = 0; dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); @@ -322,16 +330,23 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, return ERR_PTR(-ENOMEM); } - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); fsstack_copy_attr_atime(d_inode(dentry->d_parent), - d_inode(lower_dentry->d_parent)); + d_inode(path->dentry)); BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); - dentry_info->lower_path.mnt = lower_mnt; + dentry_info->lower_path.mnt = mntget(path->mnt); dentry_info->lower_path.dentry = lower_dentry; - if (d_really_is_negative(lower_dentry)) { + /* + * negative dentry can go positive under us here - its parent is not + * locked. That's OK and that could happen just as we return from + * ecryptfs_lookup() anyway. Just need to be careful and fetch + * ->d_inode only once - it's not stable here. 
+ */ + lower_inode = READ_ONCE(lower_dentry->d_inode); + + if (!lower_inode) { /* We want to add because we couldn't find in lower */ d_add(dentry, NULL); return NULL; @@ -512,22 +527,30 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) { struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *lower_dir_inode; int rc; lower_dentry = ecryptfs_dentry_to_lower(dentry); - dget(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - dget(lower_dentry); - rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); - dput(lower_dentry); - if (!rc && d_really_is_positive(dentry)) + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + lower_dir_inode = d_inode(lower_dir_dentry); + + inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + dget(lower_dentry); // don't even try to make the lower negative + if (lower_dentry->d_parent != lower_dir_dentry) + rc = -EINVAL; + else if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_rmdir(lower_dir_inode, lower_dentry); + if (!rc) { clear_nlink(d_inode(dentry)); - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); - unlock_dir(lower_dir_dentry); + fsstack_copy_attr_times(dir, lower_dir_inode); + set_nlink(dir, lower_dir_inode->i_nlink); + } + dput(lower_dentry); + inode_unlock(lower_dir_inode); if (!rc) d_drop(dentry); - dput(dentry); return rc; } @@ -565,20 +588,22 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; - struct dentry *trap = NULL; + struct dentry *trap; struct inode *target_inode; if (flags) return -EINVAL; + lower_old_dir_dentry = ecryptfs_dentry_to_lower(old_dentry->d_parent); + lower_new_dir_dentry = ecryptfs_dentry_to_lower(new_dentry->d_parent); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); - dget(lower_old_dentry); - dget(lower_new_dentry); - lower_old_dir_dentry = dget_parent(lower_old_dentry); - lower_new_dir_dentry = dget_parent(lower_new_dentry); + target_inode = d_inode(new_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dget(lower_new_dentry); rc = -EINVAL; if (lower_old_dentry->d_parent != lower_old_dir_dentry) goto out_lock; @@ -606,11 +631,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir != old_dir) fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: - unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); - dput(lower_new_dir_dentry); - dput(lower_old_dir_dentry); dput(lower_new_dentry); - dput(lower_old_dentry); + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); return rc; } diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 9d634d3a1845..74b0aaa7114c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,6 +3,7 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select LIBCRC32C help EROFS (Enhanced Read-Only File System) is a lightweight read-only file system with modern designs (eg. 
page-sized diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 19f89f9fb10c..2890a67a1ded 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -73,7 +73,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, GFP_KERNEL, false); + victim = erofs_allocpage(pagepool, GFP_KERNEL); if (!victim) return -ENOMEM; victim->mapping = Z_EROFS_MAPPING_STAGING; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b1ee5654750d..385fa49c7749 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -11,6 +11,8 @@ #define EROFS_SUPER_OFFSET 1024 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should * be incompatible with this kernel version. @@ -37,7 +39,6 @@ struct erofs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __u8 volume_name[16]; /* volume name */ __le32 feature_incompat; - __u8 reserved2[44]; }; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 544a453f3076..1ed5beff7d11 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -85,6 +85,7 @@ struct erofs_sb_info { u8 uuid[16]; /* 128-bit uuid for volume */ u8 volume_name[16]; /* volume name */ + u32 feature_compat; u32 feature_incompat; unsigned int mount_opt; @@ -278,9 +279,7 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value) extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; -#ifdef CONFIG_EROFS_FS_ZIP -extern const struct address_space_operations z_erofs_vle_normalaccess_aops; -#endif +extern const struct address_space_operations z_erofs_aops; /* * Logical to physical block mapping, used by erofs_map_blocks() @@ -382,7 +381,7 @@ int erofs_namei(struct inode *dir, struct qstr *name, extern const struct file_operations erofs_dir_fops; /* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail); +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); #if (EROFS_PCPUBUF_NR_PAGES > 0) void *erofs_get_pcpubuf(unsigned int pagenr); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 0e369494f2f2..057e6d7b5b7f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -9,6 +9,7 @@ #include <linux/statfs.h> #include <linux/parser.h> #include <linux/seq_file.h> +#include <linux/crc32c.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -46,6 +47,30 @@ void _erofs_info(struct super_block *sb, const char *function, va_end(args); } +static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) +{ + struct erofs_super_block *dsb; + u32 expected_crc, crc; + + dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, + EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL); + if (!dsb) + return -ENOMEM; + + expected_crc = le32_to_cpu(dsb->checksum); + dsb->checksum = 0; + /* to allow for x86 boot sectors and other oddities. 
*/ + crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET); + kfree(dsb); + + if (crc != expected_crc) { + erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", + crc, expected_crc); + return -EBADMSG; + } + return 0; +} + static void erofs_inode_init_once(void *ptr) { struct erofs_inode *vi = ptr; @@ -112,7 +137,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi = EROFS_SB(sb); - data = kmap_atomic(page); + data = kmap(page); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; @@ -121,6 +146,13 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } + sbi->feature_compat = le32_to_cpu(dsb->feature_compat); + if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) { + ret = erofs_superblock_csum_verify(sb, data); + if (ret) + goto out; + } + blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { @@ -155,7 +187,7 @@ static int erofs_read_superblock(struct super_block *sb) } ret = 0; out: - kunmap_atomic(data); + kunmap(page); put_page(page); return ret; } @@ -566,9 +598,6 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",cache_strategy=readahead"); } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) { seq_puts(seq, ",cache_strategy=readaround"); - } else { - seq_puts(seq, ",cache_strategy=(unknown)"); - DBG_BUGON(1); } #endif return 0; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index d92b3e753a6f..1e8e1450d5b0 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -7,7 +7,7 @@ #include "internal.h" #include <linux/pagevec.h> -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) { struct page *page; @@ -16,7 +16,7 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) DBG_BUGON(page_ref_count(page) != 1); list_del(&page->lru); } else { - page = alloc_pages(gfp | (nofail ? 
__GFP_NOFAIL : 0), 0); + page = alloc_page(gfp); } return page; } @@ -149,8 +149,7 @@ static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) } static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp, - bool cleanup) + struct erofs_workgroup *grp) { /* * If managed cache is on, refcount of workgroups @@ -188,8 +187,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, } static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink, - bool cleanup) + unsigned long nr_shrink) { pgoff_t first_index = 0; void *batch[PAGEVEC_SIZE]; @@ -208,7 +206,7 @@ repeat: first_index = grp->index + 1; /* try to shrink each valid workgroup */ - if (!erofs_try_to_release_workgroup(sbi, grp, cleanup)) + if (!erofs_try_to_release_workgroup(sbi, grp)) continue; ++freed; @@ -245,7 +243,8 @@ void erofs_shrinker_unregister(struct super_block *sb) struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); - erofs_shrink_workstation(sbi, ~0UL, true); + /* clean up all remaining workgroups in memory */ + erofs_shrink_workstation(sbi, ~0UL); spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); @@ -294,7 +293,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, spin_unlock(&erofs_sb_list_lock); sbi->shrinker_run_no = run_no; - freed += erofs_shrink_workstation(sbi, nr, false); + freed += erofs_shrink_workstation(sbi, nr); spin_lock(&erofs_sb_list_lock); /* Get the next list element before we move this one */ diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index fad80c97d247..ca99425a4536 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -337,9 +337,9 @@ retry: return COLLECT_PRIMARY; /* :( better luck next time */ } -static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_collection(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) { struct erofs_workgroup *grp; struct z_erofs_pcluster *pcl; @@ -349,20 +349,20 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); if (!grp) - return NULL; + return -ENOENT; pcl = container_of(grp, struct z_erofs_pcluster, obj); if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { DBG_BUGON(1); erofs_workgroup_put(grp); - return ERR_PTR(-EFSCORRUPTED); + return -EFSCORRUPTED; } cl = z_erofs_primarycollection(pcl); if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); erofs_workgroup_put(grp); - return ERR_PTR(-EFSCORRUPTED); + return -EFSCORRUPTED; } length = READ_ONCE(pcl->length); @@ -370,7 +370,7 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { DBG_BUGON(1); erofs_workgroup_put(grp); - return ERR_PTR(-EFSCORRUPTED); + return -EFSCORRUPTED; } } else { unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; @@ -394,12 +394,12 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, clt->tailpcl = NULL; clt->pcl = pcl; clt->cl = cl; - return cl; + return 0; } -static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_collection(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) { struct z_erofs_pcluster *pcl; struct z_erofs_collection *cl; 
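(The pair of renames above also changes the calling convention: instead of returning a collection pointer or ERR_PTR(), lookup and register now report plain integer error codes, so the caller can tell "not found yet" (-ENOENT, fall through to registration) apart from "lost a registration race" (-EAGAIN, retry). A condensed sketch of the resulting control flow, mirroring z_erofs_collector_begin() further down:

    for (;;) {
            ret = z_erofs_lookup_collection(clt, inode, map);
            if (ret != -ENOENT)
                    break;          /* found it, or a hard error */
            ret = z_erofs_register_collection(clt, inode, map);
            if (ret != -EAGAIN)
                    break;          /* registered it, or a hard error */
            cond_resched();         /* lost the race, retry the lookup */
    }
)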
@@ -408,7 +408,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, /* no available workgroup, let's allocate one */ pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); if (!pcl) - return ERR_PTR(-ENOMEM); + return -ENOMEM; z_erofs_pcluster_init_always(pcl); pcl->obj.index = map->m_pa >> PAGE_SHIFT; @@ -442,7 +442,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, if (err) { mutex_unlock(&cl->lock); kmem_cache_free(pcluster_cachep, pcl); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) @@ -450,14 +450,14 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, clt->owned_head = &pcl->next; clt->pcl = pcl; clt->cl = cl; - return cl; + return 0; } static int z_erofs_collector_begin(struct z_erofs_collector *clt, struct inode *inode, struct erofs_map_blocks *map) { - struct z_erofs_collection *cl; + int ret; DBG_BUGON(clt->cl); @@ -471,19 +471,22 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, } repeat: - cl = cllookup(clt, inode, map); - if (!cl) { - cl = clregister(clt, inode, map); + ret = z_erofs_lookup_collection(clt, inode, map); + if (ret == -ENOENT) { + ret = z_erofs_register_collection(clt, inode, map); - if (cl == ERR_PTR(-EAGAIN)) + /* someone registered at the same time, give another try */ + if (ret == -EAGAIN) { + cond_resched(); goto repeat; + } } - if (IS_ERR(cl)) - return PTR_ERR(cl); + if (ret) + return ret; z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, - cl->pagevec, cl->vcnt); + clt->cl->pagevec, clt->cl->vcnt); clt->compressedpages = clt->pcl->compressed_pages; if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ @@ -543,15 +546,6 @@ static bool z_erofs_collector_end(struct z_erofs_collector *clt) return true; } -static inline struct page *__stagingpage_alloc(struct list_head *pagepool, - gfp_t gfp) -{ - struct page *page = erofs_allocpage(pagepool, gfp, true); - - page->mapping = Z_EROFS_MAPPING_STAGING; - return page; -} - static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, unsigned int cachestrategy, erofs_off_t la) @@ -571,7 +565,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct list_head *pagepool) { struct inode *const inode = fe->inode; - struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode); + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); @@ -658,8 +652,9 @@ retry: /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = - __stagingpage_alloc(pagepool, GFP_NOFS); + erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); + newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE); if (!err) @@ -698,13 +693,11 @@ err_out: goto out; } -static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + bool sync, int bios) { - tagptr1_t t = tagptr_init(tagptr1_t, ptr); - struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t); - bool background = tagptr_unfold_tags(t); - - if (!background) { + /* wake up the caller thread for sync decompression */ + if (sync) { unsigned long flags; spin_lock_irqsave(&io->u.wait.lock, flags); @@ -718,37 +711,30 @@ static 
void z_erofs_vle_unzip_kickoff(void *ptr, int bios) queue_work(z_erofs_workqueue, &io->u.work); } -static inline void z_erofs_vle_read_endio(struct bio *bio) +static void z_erofs_decompressqueue_endio(struct bio *bio) { - struct erofs_sb_info *sbi = NULL; + tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); + struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; - bool cachemngd = false; DBG_BUGON(PageUptodate(page)); DBG_BUGON(!page->mapping); - if (!sbi && !z_erofs_page_is_staging(page)) - sbi = EROFS_SB(page->mapping->host->i_sb); - - /* sbi should already be gotten if the page is managed */ - if (sbi) - cachemngd = erofs_page_is_managed(sbi, page); - if (err) SetPageError(page); - else if (cachemngd) - SetPageUptodate(page); - if (cachemngd) + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); unlock_page(page); + } } - - z_erofs_vle_unzip_kickoff(bio->bi_private, -1); + z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } @@ -953,9 +939,8 @@ out: return err; } -static void z_erofs_vle_unzip_all(struct super_block *sb, - struct z_erofs_unzip_io *io, - struct list_head *pagepool) +static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct list_head *pagepool) { z_erofs_next_pcluster_t owned = io->head; @@ -971,21 +956,21 @@ static void z_erofs_vle_unzip_all(struct super_block *sb, pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(pcl->next); - z_erofs_decompress_pcluster(sb, pcl, pagepool); + z_erofs_decompress_pcluster(io->sb, pcl, pagepool); } } -static void z_erofs_vle_unzip_wq(struct work_struct *work) +static void z_erofs_decompressqueue_work(struct work_struct *work) { - struct z_erofs_unzip_io_sb *iosb = - container_of(work, struct z_erofs_unzip_io_sb, io.u.work); + struct z_erofs_decompressqueue *bgq = + container_of(work, struct z_erofs_decompressqueue, u.work); LIST_HEAD(pagepool); - DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool); + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + z_erofs_decompress_queue(bgq, &pagepool); put_pages_list(&pagepool); - kvfree(iosb); + kvfree(bgq); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, @@ -994,8 +979,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, struct address_space *mc, gfp_t gfp) { - /* determined at compile time to avoid too many #ifdefs */ - const bool nocache = __builtin_constant_p(mc) ? !mc : false; const pgoff_t index = pcl->obj.index; bool tocache = false; @@ -1016,7 +999,7 @@ repeat: * the cached page has not been allocated and * a placeholder is out there, prepare it now. */ - if (!nocache && page == PAGE_UNALLOCATED) { + if (page == PAGE_UNALLOCATED) { tocache = true; goto out_allocpage; } @@ -1029,21 +1012,6 @@ repeat: mapping = READ_ONCE(page->mapping); /* - * if managed cache is disabled, it's no way to - * get such a cached-like page. 
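(One detail worth calling out in z_erofs_decompressqueue_endio() above: bio->bi_private carries both the queue pointer and the sync/async flag in a single word via tagptr1_t, which hides a one-bit tag in the low bit of an aligned pointer. A generic sketch of the trick; the helper names here are illustrative, not the kernel's tagptr API:

    /* ptr must be at least 2-byte aligned so bit 0 is free for the tag */
    static inline void *tag_fold(void *ptr, unsigned long tag)
    {
            return (void *)((unsigned long)ptr | (tag & 1UL));
    }

    static inline void *tag_ptr(void *v)
    {
            return (void *)((unsigned long)v & ~1UL);
    }

    static inline unsigned long tag_bit(void *v)
    {
            return (unsigned long)v & 1UL;
    }

Here the tag records whether a caller is waiting synchronously, which is exactly what z_erofs_decompress_kickoff() needs in order to choose between waking the waiter and queueing background work.)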
- */ - if (nocache) { - /* if managed cache is disabled, it is impossible `justfound' */ - DBG_BUGON(justfound); - - /* and it should be locked, not uptodate, and not truncated */ - DBG_BUGON(!PageLocked(page)); - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(!mapping); - goto out; - } - - /* * unmanaged (file) pages are all locked solidly, * therefore it is impossible for `mapping' to be NULL. */ @@ -1093,50 +1061,52 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = __stagingpage_alloc(pagepool, gfp); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - list_add(&page->lru, pagepool); - cpu_relax(); - goto repeat; - } - if (nocache || !tocache) - goto out; - if (add_to_page_cache_lru(page, mc, index + nr, gfp)) { + page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); + if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { + /* non-LRU / non-movable temporary page is needed */ page->mapping = Z_EROFS_MAPPING_STAGING; - goto out; + tocache = false; } + if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (tocache) { + /* since it added to managed cache successfully */ + unlock_page(page); + put_page(page); + } else { + list_add(&page->lru, pagepool); + } + cond_resched(); + goto repeat; + } set_page_private(page, (unsigned long)pcl); SetPagePrivate(page); out: /* the only exit (for tracing and debugging) */ return page; } -static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb, - struct z_erofs_unzip_io *io, - bool foreground) +static struct z_erofs_decompressqueue * +jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { - struct z_erofs_unzip_io_sb *iosb; - - if (foreground) { - /* waitqueue available for foreground io */ - DBG_BUGON(!io); + struct z_erofs_decompressqueue *q; - init_waitqueue_head(&io->u.wait); - atomic_set(&io->pending_bios, 0); - goto out; + if (fg && !*fg) { + q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); + if (!q) { + *fg = true; + goto fg_out; + } + INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); + } else { +fg_out: + q = fgq; + init_waitqueue_head(&fgq->u.wait); + atomic_set(&fgq->pending_bios, 0); } - - iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL); - DBG_BUGON(!iosb); - - /* initialize fields in the allocated descriptor */ - io = &iosb->io; - iosb->sb = sb; - INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); -out: - io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; - return io; + q->sb = sb; + q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + return q; } /* define decompression jobqueue types */ @@ -1147,22 +1117,17 @@ enum { }; static void *jobqueueset_init(struct super_block *sb, - z_erofs_next_pcluster_t qtail[], - struct z_erofs_unzip_io *q[], - struct z_erofs_unzip_io *fgq, - bool forcefg) + struct z_erofs_decompressqueue *q[], + struct z_erofs_decompressqueue *fgq, bool *fg) { /* * if managed cache is enabled, bypass jobqueue is needed, * no need to read from device for all pclusters in this queue. 
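(The reworked out_allocpage path above is a lock-free publish: the freshly allocated page is installed into pcl->compressed_pages[nr] with cmpxchg(), and whichever thread loses the race recycles its page and retries, now yielding with cond_resched() instead of spinning on cpu_relax(). The idiom in isolation, with placeholder names; 'expected' stands for the slot value read earlier:

    newpage = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
    if (cmpxchg(&slot[nr], expected, newpage) != expected) {
            /* another thread published a page first: undo and retry */
            list_add(&newpage->lru, pagepool);
            cond_resched();
            goto repeat;
    }
)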
*/ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true); - qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; - - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg); - qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg)); + return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); } static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, @@ -1184,9 +1149,8 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], - unsigned int nr_bios, - bool force_fg) +static bool postsubmit_is_all_bypassed(struct z_erofs_decompressqueue *q[], + unsigned int nr_bios, bool force_fg) { /* * although background is preferred, no one is pending for submission. @@ -1195,19 +1159,19 @@ static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], if (force_fg || nr_bios) return false; - kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io)); + kvfree(q[JQ_SUBMIT]); return true; } -static bool z_erofs_vle_submit_all(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, - struct list_head *pagepool, - struct z_erofs_unzip_io *fgq, - bool force_fg) +static bool z_erofs_submit_queue(struct super_block *sb, + z_erofs_next_pcluster_t owned_head, + struct list_head *pagepool, + struct z_erofs_decompressqueue *fgq, + bool *force_fg) { - struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); + struct erofs_sb_info *const sbi = EROFS_SB(sb); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; - struct z_erofs_unzip_io *q[NR_JOBQUEUES]; + struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; struct bio *bio; void *bi_private; /* since bio will be NULL, no need to initialize last_index */ @@ -1221,7 +1185,9 @@ static bool z_erofs_vle_submit_all(struct super_block *sb, force_submit = false; bio = NULL; nr_bios = 0; - bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg); + bi_private = jobqueueset_init(sb, q, fgq, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; + qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ q[JQ_SUBMIT]->head = owned_head; @@ -1268,7 +1234,7 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_end_io = z_erofs_vle_read_endio; + bio->bi_end_io = z_erofs_decompressqueue_endio; bio_set_dev(bio, sb->s_bdev); bio->bi_iter.bi_sector = (sector_t)(first_index + i) << LOG_SECTORS_PER_BLOCK; @@ -1297,40 +1263,38 @@ skippage: if (bio) submit_bio(bio); - if (postsubmit_is_all_bypassed(q, nr_bios, force_fg)) + if (postsubmit_is_all_bypassed(q, nr_bios, *force_fg)) return true; - z_erofs_vle_unzip_kickoff(bi_private, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); return true; } -static void z_erofs_submit_and_unzip(struct super_block *sb, - struct z_erofs_collector *clt, - struct list_head *pagepool, - bool force_fg) +static void z_erofs_runqueue(struct super_block *sb, + struct z_erofs_collector *clt, + struct list_head *pagepool, bool force_fg) { - struct z_erofs_unzip_io io[NR_JOBQUEUES]; + struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (!z_erofs_vle_submit_all(sb, clt->owned_head, - pagepool, io, force_fg)) + if (!z_erofs_submit_queue(sb, clt->owned_head, + pagepool, io, &force_fg)) return; - /* decompress no I/O pclusters immediately */ - z_erofs_vle_unzip_all(sb, 
&io[JQ_BYPASS], pagepool); + /* handle bypass queue (no i/o pclusters) immediately */ + z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); if (!force_fg) return; /* wait until all bios are completed */ - wait_event(io[JQ_SUBMIT].u.wait, - !atomic_read(&io[JQ_SUBMIT].pending_bios)); + io_wait_event(io[JQ_SUBMIT].u.wait, + !atomic_read(&io[JQ_SUBMIT].pending_bios)); - /* let's synchronous decompression */ - z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool); + /* handle synchronous decompress queue in the caller context */ + z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); } -static int z_erofs_vle_normalaccess_readpage(struct file *file, - struct page *page) +static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); @@ -1345,7 +1309,7 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file, (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1364,10 +1328,8 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi, return nr <= sbi->max_sync_decompress_pages; } -static int z_erofs_vle_normalaccess_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static int z_erofs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) { struct inode *const inode = mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -1422,7 +1384,7 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp, (void)z_erofs_collector_end(&f.clt); - z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync); + z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync); if (f.map.mpage) put_page(f.map.mpage); @@ -1432,8 +1394,8 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp, return 0; } -const struct address_space_operations z_erofs_vle_normalaccess_aops = { - .readpage = z_erofs_vle_normalaccess_readpage, - .readpages = z_erofs_vle_normalaccess_readpages, +const struct address_space_operations z_erofs_aops = { + .readpage = z_erofs_readpage, + .readpages = z_erofs_readpages, }; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index faf950189bd7..7824f5563a55 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -84,7 +84,8 @@ struct z_erofs_pcluster { #define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) -struct z_erofs_unzip_io { +struct z_erofs_decompressqueue { + struct super_block *sb; atomic_t pending_bios; z_erofs_next_pcluster_t head; @@ -94,11 +95,6 @@ struct z_erofs_unzip_io { } u; }; -struct z_erofs_unzip_io_sb { - struct z_erofs_unzip_io io; - struct super_block *sb; -}; - #define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, struct page *page) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 6a26c293ae2d..736db3a4cdef 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -22,11 +22,11 @@ int z_erofs_fill_inode(struct inode *inode) set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); } - inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops; + inode->i_mapping->a_ops = &z_erofs_aops; return 0; } -static int fill_inode_lazy(struct inode *inode) +static int 
z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; @@ -138,8 +138,8 @@ static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, return 0; } -static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) +static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -311,13 +311,13 @@ out: return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); } -static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn) +static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn) { const unsigned int datamode = EROFS_I(m->inode)->datalayout; if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) - return vle_legacy_load_cluster_from_disk(m, lcn); + return legacy_load_cluster_from_disk(m, lcn); if (datamode == EROFS_INODE_FLAT_COMPRESSION) return compacted_load_cluster_from_disk(m, lcn); @@ -325,8 +325,8 @@ static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, return -EINVAL; } -static int vle_extent_lookback(struct z_erofs_maprecorder *m, - unsigned int lookback_distance) +static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, + unsigned int lookback_distance) { struct erofs_inode *const vi = EROFS_I(m->inode); struct erofs_map_blocks *const map = m->map; @@ -343,7 +343,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m, /* load extent head logical cluster if needed */ lcn -= lookback_distance; - err = vle_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn); if (err) return err; @@ -356,7 +356,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m, DBG_BUGON(1); return -EFSCORRUPTED; } - return vle_extent_lookback(m, m->delta[0]); + return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: map->m_flags &= ~EROFS_MAP_ZIPPED; /* fallthrough */ @@ -396,7 +396,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, goto out; } - err = fill_inode_lazy(inode); + err = z_erofs_fill_inode_lazy(inode); if (err) goto out; @@ -405,7 +405,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, m.lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = vle_load_cluster_from_disk(&m, m.lcn); + err = z_erofs_load_cluster_from_disk(&m, m.lcn); if (err) goto unmap_out; @@ -436,7 +436,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, /* fallthrough */ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: /* get the corresponding first chunk */ - err = vle_extent_lookback(&m, m.delta[0]); + err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; break; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c4159bcc05d9..67a395039268 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -551,28 +551,23 @@ out_unlock: */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct nested_calls poll_safewake_ncalls; - -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - unsigned long flags; - wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - - spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); - wake_up_locked_poll(wqueue, EPOLLIN); - spin_unlock_irqrestore(&wqueue->lock, flags); - - return 0; -} +static DEFINE_PER_CPU(int, wakeup_nest); static void ep_poll_safewake(wait_queue_head_t *wq) { - int this_cpu = get_cpu(); - 
ep_call_nested(&poll_safewake_ncalls, - ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); + unsigned long flags; + int subclass; - put_cpu(); + local_irq_save(flags); + preempt_disable(); + subclass = __this_cpu_read(wakeup_nest); + spin_lock_nested(&wq->lock, subclass + 1); + __this_cpu_inc(wakeup_nest); + wake_up_locked_poll(wq, POLLIN); + __this_cpu_dec(wakeup_nest); + spin_unlock(&wq->lock); + local_irq_restore(flags); + preempt_enable(); } #else @@ -671,7 +666,6 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, void *priv, int depth, bool ep_locked) { __poll_t res; - int pwake = 0; struct epitem *epi, *nepi; LIST_HEAD(txlist); @@ -738,26 +732,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, */ list_splice(&txlist, &ep->rdllist); __pm_relax(ep->ws); - - if (!list_empty(&ep->rdllist)) { - /* - * Wake up (if active) both the eventpoll wait list and - * the ->poll() wait list (delayed after we release the lock). - */ - if (waitqueue_active(&ep->wq)) - wake_up(&ep->wq); - if (waitqueue_active(&ep->poll_wait)) - pwake++; - } write_unlock_irq(&ep->lock); if (!ep_locked) mutex_unlock(&ep->mtx); - /* We have to call this outside the lock */ - if (pwake) - ep_poll_safewake(&ep->poll_wait); - return res; } @@ -2370,11 +2349,6 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* Initialize the structure used to perform safe poll wait head wake ups */ - ep_nested_calls_init(&poll_safewake_ncalls); -#endif - /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs diff --git a/fs/exec.c b/fs/exec.c index 555e93c7dec8..74d88dab98dd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,7 +59,6 @@ #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> -#include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> @@ -1015,7 +1014,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - mm_release(tsk, old_mm); + exec_mm_release(tsk, old_mm); if (old_mm) { sync_mm_rss(old_mm); @@ -1132,7 +1131,7 @@ static int de_thread(struct task_struct *tsk) * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; - tsk->real_start_time = leader->real_start_time; + tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); BUG_ON(has_group_leader_pid(tsk)); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 09bc68708d28..2dd55b172d57 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -519,26 +519,33 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, * inode is actually connected to the parent. 
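(The eventpoll hunk above drops the ep_call_nested() indirection for CONFIG_DEBUG_LOCK_ALLOC builds: the only thing the old machinery still provided was a nesting depth for lockdep, so it is replaced by a per-CPU counter that feeds spin_lock_nested() a distinct subclass per recursion level. The core of the pattern, restating the hunk:

    subclass = __this_cpu_read(wakeup_nest);   /* 0 at the outermost level */
    spin_lock_nested(&wq->lock, subclass + 1); /* distinct lockdep subclass */
    __this_cpu_inc(wakeup_nest);
    wake_up_locked_poll(wq, POLLIN);           /* may recurse back in here */
    __this_cpu_dec(wakeup_nest);
    spin_unlock(&wq->lock);

Interrupts and preemption are disabled around this window, as the hunk shows, which is what makes the per-CPU counter a reliable depth gauge.)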
*/ err = exportfs_get_name(mnt, target_dir, nbuf, result); - if (!err) { - inode_lock(target_dir->d_inode); - nresult = lookup_one_len(nbuf, target_dir, - strlen(nbuf)); - inode_unlock(target_dir->d_inode); - if (!IS_ERR(nresult)) { - if (nresult->d_inode) { - dput(result); - result = nresult; - } else - dput(nresult); - } + if (err) { + dput(target_dir); + goto err_result; } + inode_lock(target_dir->d_inode); + nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); + if (!IS_ERR(nresult)) { + if (unlikely(nresult->d_inode != result->d_inode)) { + dput(nresult); + nresult = ERR_PTR(-ESTALE); + } + } + inode_unlock(target_dir->d_inode); /* * At this point we are done with the parent, but it's pinned * by the child dentry anyway. */ dput(target_dir); + if (IS_ERR(nresult)) { + err = PTR_ERR(nresult); + goto err_result; + } + dput(result); + result = nresult; + /* * And finally make sure the dentry is actually acceptable * to NFSD. diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e0cc55164505..fa9c951d3471 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -269,7 +269,7 @@ goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal, ext2_fsblk_t group_first_block, group_last_block; group_first_block = ext2_group_first_block_no(sb, group); - group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1; + group_last_block = ext2_group_last_block_no(sb, group); if ((rsv->_rsv_start > group_last_block) || (rsv->_rsv_end < group_first_block)) @@ -666,37 +666,24 @@ ext2_try_to_allocate(struct super_block *sb, int group, unsigned long *count, struct ext2_reserve_window *my_rsv) { - ext2_fsblk_t group_first_block; + ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group); + ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group); ext2_grpblk_t start, end; unsigned long num = 0; + start = 0; + end = group_last_block - group_first_block + 1; /* we do allocation within the reservation window if we have a window */ if (my_rsv) { - group_first_block = ext2_group_first_block_no(sb, group); if (my_rsv->_rsv_start >= group_first_block) start = my_rsv->_rsv_start - group_first_block; - else - /* reservation window cross group boundary */ - start = 0; - end = my_rsv->_rsv_end - group_first_block + 1; - if (end > EXT2_BLOCKS_PER_GROUP(sb)) - /* reservation window crosses group boundary */ - end = EXT2_BLOCKS_PER_GROUP(sb); - if ((start <= grp_goal) && (grp_goal < end)) - start = grp_goal; - else + if (my_rsv->_rsv_end < group_last_block) + end = my_rsv->_rsv_end - group_first_block + 1; + if (grp_goal < start || grp_goal >= end) grp_goal = -1; - } else { - if (grp_goal > 0) - start = grp_goal; - else - start = 0; - end = EXT2_BLOCKS_PER_GROUP(sb); } - BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb)); -repeat: if (grp_goal < 0) { grp_goal = find_next_usable_block(start, bitmap_bh, end); if (grp_goal < 0) @@ -711,32 +698,23 @@ repeat: ; } } - start = grp_goal; - if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal, - bitmap_bh->b_data)) { - /* - * The block was allocated by another thread, or it was - * allocated and then freed by another thread - */ - start++; - grp_goal++; - if (start >= end) - goto fail_access; - goto repeat; - } - num++; - grp_goal++; - while (num < *count && grp_goal < end - && !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), + for (; num < *count && grp_goal < end; grp_goal++) { + if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal, bitmap_bh->b_data)) { + if (num == 0) + continue; + break; + } 
num++; - } + + if (num == 0) + goto fail_access; + *count = num; return grp_goal - num; fail_access: - *count = num; return -1; } @@ -754,10 +732,9 @@ fail_access: * but we will shift to the place where start_block is, * then start from there, when looking for a reservable space. * - * @size: the target new reservation window size + * @sb: the super block. * - * @group_first_block: the first block we consider to start - * the real search from + * @start_block: the first block we consider to start the real search from * * @last_block: * the maximum block number that our goal reservable space @@ -908,7 +885,7 @@ static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv, spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock; group_first_block = ext2_group_first_block_no(sb, group); - group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + group_end_block = ext2_group_last_block_no(sb, group); if (grp_goal < 0) start_block = group_first_block; @@ -1115,7 +1092,7 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group, * first block is the block number of the first block in this group */ group_first_block = ext2_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + group_last_block = ext2_group_last_block_no(sb, group); /* * Basically we will allocate a new block from inode's reservation @@ -1313,6 +1290,13 @@ retry_alloc: if (free_blocks > 0) { grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) % EXT2_BLOCKS_PER_GROUP(sb)); + /* + * In case we retry allocation (due to fs reservation not + * working out or fs corruption), the bitmap_bh is a non-NULL + * pointer and we have to release it before calling + * read_block_bitmap(). + */ + brelse(bitmap_bh); bitmap_bh = read_block_bitmap(sb, group_no); if (!bitmap_bh) goto io_error; @@ -1404,6 +1388,7 @@ allocated: * use. 
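(The rewritten ext2_try_to_allocate() loop above merges the old "claim the first bit, then extend the run" two-step into one scan: while nothing has been claimed yet, a set bit just moves the goal forward, and once at least one bit is owned, the first set bit terminates the run so the allocation stays contiguous. Stripped to its skeleton:

    for (; num < *count && grp_goal < end; grp_goal++) {
            if (ext2_set_bit_atomic(lock, grp_goal, bitmap)) {
                    if (num == 0)
                            continue;  /* still hunting for the first free bit */
                    break;             /* keep the claimed run contiguous */
            }
            num++;
    }
    return num ? grp_goal - num : -1;  /* first block of the run, or failure */
)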
So we may want to selectively mark some of the blocks * as free */ + num = *count; goto retry_alloc; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 10ab238de9a6..8178bd38a9d6 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -813,6 +813,18 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no) le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); } +static inline ext2_fsblk_t +ext2_group_last_block_no(struct super_block *sb, unsigned long group_no) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if (group_no == sbi->s_groups_count - 1) + return le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + return ext2_group_first_block_no(sb, group_no) + + EXT2_BLOCKS_PER_GROUP(sb) - 1; +} + #define ext2_set_bit __test_and_set_bit_le #define ext2_clear_bit __test_and_clear_bit_le #define ext2_test_bit test_bit_le diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 467c13ff6b40..119667e65890 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -701,10 +701,13 @@ static int ext2_get_blocks(struct inode *inode, if (!partial) { count++; mutex_unlock(&ei->truncate_mutex); - if (err) - goto cleanup; goto got_it; } + + if (err) { + mutex_unlock(&ei->truncate_mutex); + goto cleanup; + } } /* diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 1b853fb0b163..32a8d10b579d 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -145,10 +145,13 @@ setversion_out: if (ei->i_block_alloc_info){ struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; rsv->rsv_goal_size = rsv_window_size; + } else { + ret = -ENOMEM; } + mutex_unlock(&ei->truncate_mutex); mnt_drop_write_file(filp); - return 0; + return ret; } default: return -ENOTTY; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 30c630d73f0f..bcffe25da2f0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -702,13 +702,7 @@ static int ext2_check_descriptors(struct super_block *sb) for (i = 0; i < sbi->s_groups_count; i++) { struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL); ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i); - ext2_fsblk_t last_block; - - if (i == sbi->s_groups_count - 1) - last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; - else - last_block = first_block + - (EXT2_BLOCKS_PER_GROUP(sb) - 1); + ext2_fsblk_t last_block = ext2_group_last_block_no(sb, i); if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || le32_to_cpu(gdp->bg_block_bitmap) > last_block) @@ -806,7 +800,6 @@ static unsigned long descriptor_loc(struct super_block *sb, { struct ext2_sb_info *sbi = EXT2_SB(sb); unsigned long bg, first_meta_bg; - int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); @@ -814,10 +807,8 @@ static unsigned long descriptor_loc(struct super_block *sb, nr < first_meta_bg) return (logic_sb_block + nr + 1); bg = sbi->s_desc_per_block * nr; - if (ext2_bg_has_super(sb, bg)) - has_super = 1; - return ext2_group_first_block_no(sb, bg) + has_super; + return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg); } static int ext2_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index cbb5ca830e57..ef42ab040905 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -106,3 +106,20 @@ config EXT4_DEBUG If you select Y here, then you will be able to turn on debugging with a command such as: echo 1 > /sys/module/ext4/parameters/mballoc_debug + +config EXT4_KUNIT_TESTS + bool "KUnit tests for ext4" + select EXT4_FS + depends on KUNIT + help + This builds the ext4 KUnit tests. 
+ + KUnit tests run during boot and output the results to the debug log + in TAP format (http://testanything.org/). These tests are only useful for kernel devs + running the KUnit test harness and are not for inclusion into a production + build. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index b17ddc229ac5..840b91d040f1 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -13,4 +13,5 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_KUNIT_TESTS) += inode-test.o ext4-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 03db3e71676c..f8578caba40d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -198,6 +198,12 @@ struct ext4_system_blocks { */ #define EXT4_IO_END_UNWRITTEN 0x0001 +struct ext4_io_end_vec { + struct list_head list; /* list of io_end_vec */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ +}; + /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. @@ -211,8 +217,7 @@ typedef struct ext4_io_end { * bios covering the extent */ unsigned int flag; /* unwritten or not */ atomic_t count; /* reference counter */ - loff_t offset; /* offset in the file */ - ssize_t size; /* size of the extent */ + struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { @@ -1579,7 +1584,6 @@ enum { EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ - EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ @@ -1678,6 +1682,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 +#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -1779,6 +1784,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) +EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) @@ -2560,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, @@ -2604,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern int 
ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); @@ -3264,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, + ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, @@ -3296,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); +extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred); + /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -3322,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, int len, struct writeback_control *wbc, bool keep_towrite); +extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); +extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); @@ -3379,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 7c70b08d104c..d3b8cdea5df7 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb) } handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks) + int type, int blocks, int rsv_blocks, + int revoke_creds) { journal_t *journal; int err; - trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds, + _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) return ERR_PTR(err); @@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, journal = EXT4_SB(sb)->s_journal; if (!journal) return ext4_get_nojournal(); - return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, - type, line); + return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds, + GFP_NOFS, type, line); } int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) @@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return ext4_get_nojournal(); sb = handle->h_journal->j_private; - trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, - _RET_IP_); + trace_ext4_journal_start_reserved(sb, + jbd2_handle_buffer_credits(handle), _RET_IP_); err = ext4_journal_check_start(sb); if (err < 0) { jbd2_journal_free_reserved(handle); @@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, return handle; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred) +{ + if (!ext4_handle_valid(handle)) + return 0; + 
if (jbd2_handle_buffer_credits(handle) >= check_cred && + handle->h_revoke_credits >= revoke_cred) + return 0; + extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle)); + revoke_cred = max(0, revoke_cred - handle->h_revoke_credits); + return ext4_journal_extend(handle, extend_cred, revoke_cred); +} + static void ext4_journal_abort_handle(const char *caller, unsigned int line, const char *err_fn, struct buffer_head *bh, @@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), err); return err; } ext4_error_inode(inode, where, line, @@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle->h_type, handle->h_line_no, handle->h_requested_credits, - handle->h_buffer_credits, err); + jbd2_handle_buffer_credits(handle), + err); } } else { if (inode) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index ef8fcf7d0d3b..a6b9b66dbfad 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, - int type, int blocks, int rsv_blocks); + int type, int blocks, int rsv_blocks, + int revoke_creds); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle) return 0; } -static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +static inline int ext4_free_metadata_revoke_credits(struct super_block *sb, + int blocks) { - if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) - return 0; - return 1; + /* Freeing each metadata block can result in freeing one cluster */ + return blocks * EXT4_SB(sb)->s_cluster_ratio; +} + +static inline int ext4_trans_default_revoke_credits(struct super_block *sb) +{ + return ext4_free_metadata_revoke_credits(sb, 8); } #define ext4_journal_start_sb(sb, type, nblocks) \ - __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits(sb)) #define ext4_journal_start(inode, type, nblocks) \ - __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \ + ext4_trans_default_revoke_credits((inode)->i_sb)) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\ + ext4_trans_default_revoke_credits((inode)->i_sb)) -#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ - __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) +#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \ + (revoke_creds)) static inline handle_t *__ext4_journal_start(struct inode *inode, unsigned int line, int type, - int blocks, int rsv_blocks) + int blocks, int rsv_blocks, + int revoke_creds) { return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, - rsv_blocks); + rsv_blocks, revoke_creds); } #define ext4_journal_stop(handle) \ @@ -332,20 +346,68 @@ static inline handle_t 
*ext4_journal_current_handle(void) return journal_current_handle(); } -static inline int ext4_journal_extend(handle_t *handle, int nblocks) +static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_extend(handle, nblocks); + return jbd2_journal_extend(handle, nblocks, revoke); return 0; } -static inline int ext4_journal_restart(handle_t *handle, int nblocks) +static inline int ext4_journal_restart(handle_t *handle, int nblocks, + int revoke) { if (ext4_handle_valid(handle)) - return jbd2_journal_restart(handle, nblocks); + return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS); return 0; } +int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, + int extend_cred, int revoke_cred); + + +/* + * Ensure @handle has at least @check_creds credits available. If not, + * transaction will be extended or restarted to contain at least @extend_cred + * credits. Before restarting transaction @fn is executed to allow for cleanup + * before the transaction is restarted. + * + * The return value is < 0 in case of error, 0 in case the handle has enough + * credits or transaction extension succeeded, 1 in case transaction had to be + * restarted. + */ +#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \ + revoke_cred, fn) \ +({ \ + __label__ __ensure_end; \ + int err = __ext4_journal_ensure_credits((handle), (check_cred), \ + (extend_cred), (revoke_cred)); \ + \ + if (err <= 0) \ + goto __ensure_end; \ + err = (fn); \ + if (err < 0) \ + goto __ensure_end; \ + err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \ + if (err == 0) \ + err = 1; \ +__ensure_end: \ + err; \ +}) + +/* + * Ensure given handle has at least requested amount of credits available, + * possibly restarting transaction if needed. We also make sure the transaction + * has space for at least ext4_trans_default_revoke_credits(sb) revoke records + * as freeing one or two blocks is very common pattern and requesting this is + * very cheap. 
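(ext4_journal_ensure_credits_fn() above is worth a close look as C: it is a GNU statement expression whose expansion needs an early exit, and since the macro may be used more than once per function a plain label would collide, so it declares one with __label__, a GCC extension that scopes the label to the enclosing block. The shape of the construct in miniature; this is an illustrative macro, not a kernel API:

    #define clamp_nonneg(x)                         \
    ({                                              \
            __label__ done;                         \
            int __v = (x);                          \
            if (__v >= 0)                           \
                    goto done;                      \
            __v = 0;                                \
    done:                                           \
            __v;                                    \
    })

The value of the statement expression is the final '__v;', exactly as the real macro ends with '__ensure_end: err;'.)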
+ */ +static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, + int revoke_creds) +{ + return ext4_journal_ensure_credits_fn(handle, credits, credits, + revoke_creds, 0); +} + static inline int ext4_journal_blocks_per_page(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) @@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && !test_opt(inode->i_sb, DELALLOC))) { @@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode) return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; } +static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks) +{ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 0; + if (!ext4_should_journal_data(inode)) + return 0; + /* + * Data blocks in one extent are contiguous, just account for partial + * clusters at extent boundaries + */ + return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1); +} + /* * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fb0f99dc8c22..0e8708b77da6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle, static int ext4_find_delayed_extent(struct inode *inode, struct extent_status *newes); -static int ext4_ext_truncate_extend_restart(handle_t *handle, - struct inode *inode, - int needed) +static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { - int err; - - if (!ext4_handle_valid(handle)) - return 0; - if (handle->h_buffer_credits >= needed) - return 0; /* - * If we need to extend the journal get a few extra blocks - * while we're at it for efficiency's sake. + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. */ - needed += 3; - err = ext4_journal_extend(handle, needed - handle->h_buffer_credits); - if (err <= 0) - return err; - err = ext4_truncate_restart_trans(handle, inode, needed); - if (err == 0) - err = -EAGAIN; + BUG_ON(EXT4_JOURNAL(inode) == NULL); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + *dropped = 1; + return 0; +} - return err; +/* + * Make sure 'handle' has at least 'check_cred' credits. If not, restart + * transaction with 'restart_cred' credits. The function drops i_data_sem + * when restarting transaction and gets it after transaction is restarted. + * + * The function returns 0 on success, 1 if transaction had to be restarted, + * and < 0 in case of fatal error. 
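(A typical caller of the function documented above appears in ext4_access_path() further down: it asks for at least 7 credits, requests a full ext4_writepage_trans_blocks(inode) worth on restart, and treats a positive return as "the handle was restarted". Sketch of the caller contract:

    err = ext4_datasem_ensure_credits(handle, inode, 7,
                                      ext4_writepage_trans_blocks(inode), 0);
    if (err < 0)
            return err;     /* fatal */
    /* err == 1: i_data_sem was dropped and re-taken around a transaction
     * restart, so any cached extent-tree state must be revalidated */
)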
+ */ +int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, + int check_cred, int restart_cred, + int revoke_cred) +{ + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred, + revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + return ret; } /* @@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; - /* - * The check for IO to unwritten extent is somewhat racy as we - * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after - * dropping i_data_sem. But reserved blocks should save us in that - * case. - */ + if (ext4_ext_is_unwritten(ex1) && - (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || - atomic_read(&EXT4_I(inode)->i_unwritten) || - (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) + ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) @@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ - if (ext4_journal_extend(handle, 2)) + if (ext4_journal_extend(handle, 2, + ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* @@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; - int depth = ext_depth(inode), credits; + int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; @@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - if (err) + /* + * We may end up freeing some index blocks and data from the + * punched range. Note that partial clusters are accounted for + * by ext4_free_data_revoke_credits(). + */ + revoke_credits = + ext4_free_metadata_revoke_credits(inode->i_sb, + ext_depth(inode)) + + ext4_free_data_revoke_credits(inode, b - a + 1); + + err = ext4_datasem_ensure_credits(handle, inode, credits, + credits, revoke_credits); + if (err) { + if (err > 0) + err = -EAGAIN; goto out; + } err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); + handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, + depth + 1, + ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, int ret = 0; int ret2 = 0; struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - /* - * This is somewhat ugly but the idea is clear: When transaction is - * reserved, everything goes into it. Otherwise we rather start several - * smaller transactions for conversion of each extent separately. 
- */ - if (handle) { - handle = ext4_journal_start_reserved(handle, - EXT4_HT_EXT_CONVERT); - if (IS_ERR(handle)) - return PTR_ERR(handle); - credits = 0; - } else { + if (!handle) { /* * credits to insert 1 extent into extent tree */ @@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, if (ret <= 0 || ret2) break; } - if (!credits) - ret2 = ext4_journal_stop(handle); return ret > 0 ? ret2 : ret; } +int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) +{ + int ret, err = 0; + struct ext4_io_end_vec *io_end_vec; + + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. + */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + + list_for_each_entry(io_end_vec, &io_end->list_vec, list) { + ret = ext4_convert_unwritten_extents(handle, io_end->inode, + io_end_vec->offset, + io_end_vec->size); + if (ret) + break; + } + + if (handle) + err = ext4_journal_stop(handle); + + return ret < 0 ? ret : err; +} + /* * If newes is not existing extent (newes->ec_pblk equals zero) find * delayed extent at start of newes and update newes accordingly and @@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode, * descriptor) for each block group; assume two block * groups */ - if (handle->h_buffer_credits < 7) { - credits = ext4_writepage_trans_blocks(inode); - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - /* EAGAIN is success */ - if (err && err != -EAGAIN) - return err; - } + credits = ext4_writepage_trans_blocks(inode); + err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0); + if (err < 0) + return err; err = ext4_ext_get_access(handle, inode, path); return err; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8d2bbcc2d813..6a7293a5cda2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -29,10 +29,58 @@ #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> +#include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "truncate.h" + +static bool ext4_dio_supported(struct inode *inode) +{ + if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode)) + return false; + if (fsverity_active(inode)) + return false; + if (ext4_should_journal_data(inode)) + return false; + if (ext4_has_inline_data(inode)) + return false; + return true; +} + +static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + if (!ext4_dio_supported(inode)) { + inode_unlock_shared(inode); + /* + * Fallback to buffered I/O if the operation being performed on + * the inode is not supported by direct I/O. The IOCB_DIRECT + * flag needs to be cleared here in order to ensure that the + * direct I/O path within generic_file_read_iter() is not + * taken. 
+ */ + iocb->ki_flags &= ~IOCB_DIRECT; + return generic_file_read_iter(iocb, to); + } + + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, + is_sync_kiocb(iocb)); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb)))) + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX - if (IS_DAX(file_inode(iocb->ki_filp))) + if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_read_iter(iocb, to); + return generic_file_read_iter(iocb, to); } @@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) return 0; } -static void ext4_unwritten_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); -} - /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they @@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. 
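ext4_dio_read_iter() above clears IOCB_DIRECT and reissues the read through generic_file_read_iter() whenever the inode cannot do direct I/O. Userspace that opens files with O_DIRECT needs the mirror-image fallback; a self-contained sketch of that pattern (the 4096-byte alignment is an assumption that holds for common block sizes, and error handling is trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/etc/hostname";
	void *buf;
	ssize_t n;
	int fd;

	/* Try direct I/O first; fall back to buffered if refused. */
	fd = open(path, O_RDONLY | O_DIRECT);
	if (fd < 0)
		fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;

	/* O_DIRECT requires block-aligned buffers and offsets. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;

	n = read(fd, buf, 4096);
	if (n < 0) {	/* e.g. EINVAL where direct I/O is unsupported */
		fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) & ~O_DIRECT);
		n = read(fd, buf, 4096);	/* buffered retry */
	}
	printf("read %zd bytes\n", n);
	free(buf);
	close(fd);
	return 0;
}

Clearing the flag and retrying is the userspace analogue of the iocb->ki_flags &= ~IOCB_DIRECT step above.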
@@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + + ret = file_modified(iocb->ki_filp); + if (ret) + return ret; + return iov_iter_count(from); } -#ifdef CONFIG_FS_DAX -static ssize_t -ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - inode_lock(inode); - } + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; - ret = file_remove_privs(iocb->ki_filp); - if (ret) - goto out; - ret = file_update_time(iocb->ki_filp); - if (ret) - goto out; - ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + current->backing_dev_info = NULL; + out: inode_unlock(inode); - if (ret > 0) + if (likely(ret > 0)) { + iocb->ki_pos += ret; ret = generic_write_sync(iocb, ret); + } + return ret; } -#endif -static ssize_t -ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, + ssize_t written, size_t count) { + handle_t *handle; + bool truncate = false; + u8 blkbits = inode->i_blkbits; + ext4_lblk_t written_blk, end_blk; + + /* + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But, the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. + */ + WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); + if (offset + count <= EXT4_I(inode)->i_disksize) { + /* + * We need to ensure that the inode is removed from the orphan + * list if it has been added prematurely, due to writeback of + * delalloc blocks. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return PTR_ERR(handle); + } + + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } + + return written; + } + + if (written < 0) + goto truncate; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + written = PTR_ERR(handle); + goto truncate; + } + + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + written_blk = ALIGN(offset + written, 1 << blkbits); + end_blk = ALIGN(offset + count, 1 << blkbits); + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + + /* + * Remove the inode from the orphan list if it has been extended and + * everything went OK. + */ + if (!truncate && inode->i_nlink) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + + if (truncate) { +truncate: + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. 
In that case, we need to try to
+		 * remove the inode from the in-memory linked list.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	return written;
+}
+
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+				 int error, unsigned int flags)
+{
+	loff_t offset = iocb->ki_pos;
 	struct inode *inode = file_inode(iocb->ki_filp);
-	int o_direct = iocb->ki_flags & IOCB_DIRECT;
-	int unaligned_aio = 0;
-	int overwrite = 0;
+
+	if (error)
+		return error;
+
+	if (size && flags & IOMAP_DIO_UNWRITTEN)
+		return ext4_convert_unwritten_extents(NULL, inode,
+						      offset, size);
+
+	return 0;
+}
+
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+	.end_io = ext4_dio_write_end_io,
+};
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
 	ssize_t ret;
+	size_t count;
+	loff_t offset;
+	handle_t *handle;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	bool extend = false, overwrite = false, unaligned_aio = false;
 
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
-		return -EIO;
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock(inode);
+	}
+
+	if (!ext4_dio_supported(inode)) {
+		inode_unlock(inode);
+		/*
+		 * Fall back to buffered I/O if the inode does not support
+		 * direct I/O.
+		 */
+		return ext4_buffered_write_iter(iocb, from);
+	}
+
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0) {
+		inode_unlock(inode);
+		return ret;
+	}
+
+	/*
+	 * Unaligned asynchronous direct I/O writes must be serialized against
+	 * each other, as the zeroing of partial blocks by two competing
+	 * unaligned asynchronous direct I/O writes can result in data
+	 * corruption.
+	 */
+	offset = iocb->ki_pos;
+	count = iov_iter_count(from);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+		unaligned_aio = true;
+		inode_dio_wait(inode);
+	}
+
+	/*
+	 * Determine whether the I/O will overwrite allocated and initialized
+	 * blocks. If so, check to see whether it is possible to take the
+	 * dioread_nolock path.
+	 */
+	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+	    ext4_should_dioread_nolock(inode)) {
+		overwrite = true;
+		downgrade_write(&inode->i_rwsem);
+	}
+
+	if (offset + count > EXT4_I(inode)->i_disksize) {
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+
+		ret = ext4_orphan_add(handle, inode);
+		if (ret) {
+			ext4_journal_stop(handle);
+			goto out;
+		}
+
+		extend = true;
+		ext4_journal_stop(handle);
+	}
+
+	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
+			   is_sync_kiocb(iocb) || unaligned_aio || extend);
+
+	if (extend)
+		ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+	if (overwrite)
+		inode_unlock_shared(inode);
+	else
+		inode_unlock(inode);
+
+	if (ret >= 0 && iov_iter_count(from)) {
+		ssize_t err;
+		loff_t endbyte;
+
+		offset = iocb->ki_pos;
+		err = ext4_buffered_write_iter(iocb, from);
+		if (err < 0)
+			return err;
+
+		/*
+		 * We need to ensure that the pages within the page cache for
+		 * the range covered by this I/O are written to disk and
+		 * invalidated. This is an attempt to preserve the expected
+		 * direct I/O semantics in the case we fall back to buffered
+		 * I/O to complete the rest of the I/O request.
+ */ + ret += err; + endbyte = offset + err - 1; + err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, + offset, endbyte); + if (!err) + invalidate_mapping_pages(iocb->ki_filp->f_mapping, + offset >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } + + return ret; +} #ifdef CONFIG_FS_DAX - if (IS_DAX(inode)) - return ext4_dax_write_iter(iocb, from); -#endif +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret; + size_t count; + loff_t offset; + handle_t *handle; + bool extend = false; + struct inode *inode = file_inode(iocb->ki_filp); if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) @@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) goto out; - /* - * Unaligned direct AIO must be serialized among each other as zeroing - * of partial blocks of two competing unaligned AIOs can result in data - * corruption. - */ - if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && - ext4_unaligned_aio(inode, from, iocb->ki_pos)) { - unaligned_aio = 1; - ext4_unwritten_wait(inode); - } + offset = iocb->ki_pos; + count = iov_iter_count(from); - iocb->private = &overwrite; - /* Check whether we do a DIO overwrite or not */ - if (o_direct && !unaligned_aio) { - if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { - if (ext4_should_dioread_nolock(inode)) - overwrite = 1; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; + if (offset + count > EXT4_I(inode)->i_disksize) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); goto out; } - } - ret = __generic_file_write_iter(iocb, from); - /* - * Unaligned direct AIO must be the only IO in flight. Otherwise - * overlapping aligned IO after unaligned might result in data - * corruption. 
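The flush-and-invalidate step above keeps the buffered fallback from leaving stale pages behind a nominally direct write. Userspace that mixes direct and buffered writers has the same obligation; a hedged approximation with standard Linux syscalls (sync_file_range() and posix_fadvise() stand in for the kernel-internal writeback and invalidate used above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("fallback-demo.bin", O_RDWR | O_CREAT, 0644);
	const char data[] = "buffered tail of a mostly-direct write";
	off_t offset = 1 << 20;		/* range that went buffered */

	if (fd < 0)
		return 1;
	pwrite(fd, data, sizeof(data), offset);

	/* Push the dirty range to disk and wait for it... */
	sync_file_range(fd, offset, sizeof(data),
			SYNC_FILE_RANGE_WAIT_BEFORE |
			SYNC_FILE_RANGE_WRITE |
			SYNC_FILE_RANGE_WAIT_AFTER);
	/* ...then ask the kernel to drop the now-clean pages. */
	posix_fadvise(fd, offset, sizeof(data), POSIX_FADV_DONTNEED);

	close(fd);
	return 0;
}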
- */ - if (ret == -EIOCBQUEUED && unaligned_aio) - ext4_unwritten_wait(inode); - inode_unlock(inode); + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } - if (ret > 0) - ret = generic_write_sync(iocb, ret); + extend = true; + ext4_journal_stop(handle); + } - return ret; + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); + if (extend) + ret = ext4_handle_inode_extension(inode, offset, ret, count); out: inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } +#endif + +static ssize_t +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext4_dio_write_iter(iocb, from); + + return ext4_buffered_write_iter(iocb, from); +} #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, @@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); - offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_hole(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); - offset = iomap_seek_data(inode, offset, &ext4_iomap_ops); + offset = iomap_seek_data(inode, offset, + &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5508baa11bb6..e10206e7f4bb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode) return ret; } +static int ext4_fsync_nojournal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + int ret, err; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY_ALL)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (!ret) + ret = err; + + if (!ret) + ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) + *needs_barrier = true; + + return ret; +} + +static int ext4_fsync_journal(struct inode *inode, bool datasync, + bool *needs_barrier) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + *needs_barrier = true; + + return jbd2_complete_transaction(journal, commit_tid); +} + /* * akpm: A new design for ext4_sync_file(). * @@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode) * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. 
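Splitting the sync path into ext4_fsync_nojournal() and ext4_fsync_journal() above preserves the userspace-visible contract: fdatasync() may skip the flush when only non-critical metadata is dirty (roughly what the I_DIRTY_DATASYNC and i_datasync_tid tests above encode), while fsync() must persist everything. From the application side the distinction is simply which syscall to issue:

#include <fcntl.h>
#include <unistd.h>

/* Choose the cheaper fdatasync() unless the write changed metadata
 * that matters for recovery, e.g. the file length grew. A sketch of
 * the calling convention, not an ext4-specific interface. */
static int persist(int fd, int size_changed)
{
	return size_changed ? fsync(fd) : fdatasync(fd);
}

int main(void)
{
	int fd = open("fsync-demo.bin", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "x", 1) == 1)
		persist(fd, 1);		/* file length changed */
	close(fd);
	return 0;
}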
*/ - int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; int ret = 0, err; - tid_t commit_tid; bool needs_barrier = false; + struct inode *inode = file->f_mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(sbi))) return -EIO; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_mount_flags value */ smp_rmb(); - if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ret = -EROFS; goto out; } - if (!journal) { - ret = __generic_file_fsync(file, start, end, datasync); - if (!ret) - ret = ext4_sync_parent(inode); - if (test_opt(inode->i_sb, BARRIER)) - goto issue_flush; - goto out; - } - ret = file_write_and_wait_range(file, start, end); if (ret) return ret; + /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. @@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) { + if (!sbi->s_journal) + ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); + else if (ext4_should_journal_data(inode)) ret = ext4_force_commit(inode->i_sb); - goto out; - } + else + ret = ext4_fsync_journal(inode, datasync, &needs_barrier); - commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = true; - ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - issue_flush: err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) ret = err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 764ff4c56233..dc333e8e51e8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -265,13 +265,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_debug("freeing inode %lu\n", ino); trace_ext4_free_inode(inode); - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. - */ dquot_initialize(inode); dquot_free_inode(inode); - dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); @@ -927,7 +922,7 @@ repeat_in_this_group: BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, handle_type, nblocks, - 0); + 0, 0); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36699a131168..3a4ab70fe9e0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle, for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); - } else + } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); + /* Simplify error cleanup... 
*/
+			branch[i+1].bh = NULL;
+		}
 		if (err) {
 			i--;
 			goto failed;
@@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle,
 	}
 	return 0;
 failed:
+	if (i == indirect_blks) {
+		/* Free data blocks */
+		ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+				 ar->len, 0);
+		i--;
+	}
 	for (; i >= 0; i--) {
 		/*
 		 * We want to ext4_forget() only freshly allocated indirect
-		 * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
-		 * buffer at branch[0].bh is indirect block / inode already
-		 * existing before ext4_alloc_branch() was called.
+		 * blocks. Buffer for new_blocks[i] is at branch[i+1].bh
+		 * (buffer at branch[0].bh is indirect block / inode already
+		 * existing before ext4_alloc_branch() was called). Also
+		 * because blocks are freshly allocated, we don't need to
+		 * revoke them, which is why we don't set
+		 * EXT4_FREE_BLOCKS_METADATA.
 		 */
-		if (i > 0 && i != indirect_blks && branch[i].bh)
-			ext4_forget(handle, 1, ar->inode, branch[i].bh,
-				    branch[i].bh->b_blocknr);
-		ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
-				 (i == indirect_blks) ? ar->len : 1, 0);
+		ext4_free_blocks(handle, ar->inode, branch[i+1].bh,
+				 new_blocks[i], 1,
+				 branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0);
 	}
 	return err;
 }
@@ -689,27 +699,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
+static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+				     struct buffer_head *bh, int *dropped)
+{
+	int err;
+
+	if (bh) {
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (unlikely(err))
+			return err;
+	}
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (unlikely(err))
+		return err;
+	/*
+	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+	 * moment, get_block can be called only for blocks inside i_size since
+	 * page cache has been already dropped and writes are blocked by
+	 * i_mutex. So we can safely drop the i_data_sem here.
+	 */
+	BUG_ON(EXT4_JOURNAL(inode) == NULL);
+	ext4_discard_preallocations(inode);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	*dropped = 1;
+	return 0;
+}
+
 /*
  * Truncate transactions can be complex and absolutely huge. So we need to
  * be able to restart the transaction at a convenient checkpoint to make
  * sure we don't overflow the journal.
  *
  * Try to extend this transaction for the purposes of truncation. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * extending it fails, we restart the transaction.
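The reworked failure path in ext4_alloc_branch() clears branch[i+1].bh before each allocation precisely so the unwind loop can free exactly what was allocated, newest first. The same shape in plain C, with invented names, shows why initialising the bookkeeping ahead of the allocation simplifies the cleanup:

#include <stdlib.h>

#define STEPS 4

int main(void)
{
	void *track[STEPS + 1] = { 0 };	/* slot i+1 records step i */
	int i;

	for (i = 0; i < STEPS; i++) {
		track[i + 1] = malloc(64);
		if (!track[i + 1] || i == 2)	/* injected failure */
			goto failed;
	}
	i = STEPS - 1;				/* full success: free all */
failed:
	for (; i >= 0; i--) {			/* unwind, newest first */
		free(track[i + 1]);		/* free(NULL) is a no-op */
		track[i + 1] = NULL;
	}
	return 0;
}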
*/ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +static int ext4_ind_truncate_ensure_credits(handle_t *handle, + struct inode *inode, + struct buffer_head *bh, + int revoke_creds) { - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) - return 0; - return 1; + int ret; + int dropped = 0; + + ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_blocks_for_truncate(inode), revoke_creds, + ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); + if (dropped) + down_write(&EXT4_I(inode)->i_data_sem); + if (ret <= 0) + return ret; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ret = ext4_journal_get_write_access(handle, bh); + if (unlikely(ret)) + return ret; + } + return 0; } /* @@ -844,27 +890,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, return 1; } - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } + err = ext4_ind_truncate_ensure_credits(handle, inode, bh, + ext4_free_data_revoke_credits(inode, count)); + if (err < 0) + goto out_err; for (p = first; p < last; p++) *p = 0; @@ -1047,11 +1076,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ if (ext4_handle_is_aborted(handle)) return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - } + if (ext4_ind_truncate_ensure_credits(handle, inode, + NULL, + ext4_free_metadata_revoke_credits( + inode->i_sb, 1)) < 0) + return; /* * The forget flag here is critical because if diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c new file mode 100644 index 000000000000..92a9da1774aa --- /dev/null +++ b/fs/ext4/inode-test.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 inode that verify the seconds part of [a/c/m] + * timestamps in ext4 inode structs are decoded correctly. + */ + +#include <kunit/test.h> +#include <linux/kernel.h> +#include <linux/time64.h> + +#include "ext4.h" + +/* + * For constructing the nonnegative timestamp lower bound value. + * binary: 00000000 00000000 00000000 00000000 + */ +#define LOWER_MSB_0 0L +/* + * For constructing the nonnegative timestamp upper bound value. + * binary: 01111111 11111111 11111111 11111111 + * + */ +#define UPPER_MSB_0 0x7fffffffL +/* + * For constructing the negative timestamp lower bound value. + * binary: 10000000 00000000 00000000 00000000 + */ +#define LOWER_MSB_1 (-0x80000000L) +/* + * For constructing the negative timestamp upper bound value. + * binary: 11111111 11111111 11111111 11111111 + */ +#define UPPER_MSB_1 (-1L) +/* + * Upper bound for nanoseconds value supported by the encoding. 
+ * binary: 00111111 11111111 11111111 11111111 + */ +#define MAX_NANOSECONDS ((1L << 30) - 1) + +#define CASE_NAME_FORMAT "%s: msb:%x lower_bound:%x extra_bits: %x" + +#define LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE\ + "1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits" +#define UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE\ + "1969-12-31 Upper bound of 32bit < 0 timestamp, no extra bits" +#define LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\ + "1970-01-01 Lower bound of 32bit >=0 timestamp, no extra bits" +#define UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\ + "2038-01-19 Upper bound of 32bit >=0 timestamp, no extra bits" +#define LOWER_BOUND_NEG_LO_1_CASE\ + "2038-01-19 Lower bound of 32bit <0 timestamp, lo extra sec bit on" +#define UPPER_BOUND_NEG_LO_1_CASE\ + "2106-02-07 Upper bound of 32bit <0 timestamp, lo extra sec bit on" +#define LOWER_BOUND_NONNEG_LO_1_CASE\ + "2106-02-07 Lower bound of 32bit >=0 timestamp, lo extra sec bit on" +#define UPPER_BOUND_NONNEG_LO_1_CASE\ + "2174-02-25 Upper bound of 32bit >=0 timestamp, lo extra sec bit on" +#define LOWER_BOUND_NEG_HI_1_CASE\ + "2174-02-25 Lower bound of 32bit <0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NEG_HI_1_CASE\ + "2242-03-16 Upper bound of 32bit <0 timestamp, hi extra sec bit on" +#define LOWER_BOUND_NONNEG_HI_1_CASE\ + "2242-03-16 Lower bound of 32bit >=0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NONNEG_HI_1_CASE\ + "2310-04-04 Upper bound of 32bit >=0 timestamp, hi extra sec bit on" +#define UPPER_BOUND_NONNEG_HI_1_NS_1_CASE\ + "2310-04-04 Upper bound of 32bit>=0 timestamp, hi extra sec bit 1. 1 ns" +#define LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE\ + "2378-04-22 Lower bound of 32bit>= timestamp. Extra sec bits 1. Max ns" +#define LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE\ + "2378-04-22 Lower bound of 32bit >=0 timestamp. All extra sec bits on" +#define UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE\ + "2446-05-10 Upper bound of 32bit >=0 timestamp. All extra sec bits on" + +struct timestamp_expectation { + const char *test_case_name; + struct timespec64 expected; + u32 extra_bits; + bool msb_set; + bool lower_bound; +}; + +static time64_t get_32bit_time(const struct timestamp_expectation * const test) +{ + if (test->msb_set) { + if (test->lower_bound) + return LOWER_MSB_1; + + return UPPER_MSB_1; + } + + if (test->lower_bound) + return LOWER_MSB_0; + return UPPER_MSB_0; +} + + +/* + * Test data is derived from the table in the Inode Timestamps section of + * Documentation/filesystems/ext4/inodes.rst. 
+ */ +static void inode_test_xtimestamp_decoding(struct kunit *test) +{ + const struct timestamp_expectation test_data[] = { + { + .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 0, + .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = -1LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0, + .expected = {0LL, 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 6, + .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0xFFFFFFFF, + .expected = {.tv_sec = 0x300000000LL, + .tv_nsec = MAX_NANOSECONDS}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 3, + .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 3, + .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L}, + } + }; + + struct timespec64 timestamp; + int i; + + for (i = 0; i < ARRAY_SIZE(test_data); ++i) { + timestamp.tv_sec = get_32bit_time(&test_data[i]); + ext4_decode_extra_time(×tamp, + cpu_to_le32(test_data[i].extra_bits)); + + KUNIT_EXPECT_EQ_MSG(test, + test_data[i].expected.tv_sec, + timestamp.tv_sec, + CASE_NAME_FORMAT, + test_data[i].test_case_name, + test_data[i].msb_set, + test_data[i].lower_bound, + test_data[i].extra_bits); + KUNIT_EXPECT_EQ_MSG(test, + test_data[i].expected.tv_nsec, + timestamp.tv_nsec, + CASE_NAME_FORMAT, + 
test_data[i].test_case_name, + test_data[i].msb_set, + test_data[i].lower_bound, + test_data[i].extra_bits); + } +} + +static struct kunit_case ext4_inode_test_cases[] = { + KUNIT_CASE(inode_test_xtimestamp_decoding), + {} +}; + +static struct kunit_suite ext4_inode_test_suite = { + .name = "ext4_inode_test", + .test_cases = ext4_inode_test_cases, +}; + +kunit_test_suite(ext4_inode_test_suite); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index abaaf7d96ca4..28f28de0c1b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -164,39 +164,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode) } /* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, - int nblocks) -{ - int ret; - - /* - * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this - * moment, get_block can be called only for blocks inside i_size since - * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. - */ - BUG_ON(EXT4_JOURNAL(inode) == NULL); - jbd_debug(2, "restarting handle %p\n", handle); - up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, nblocks); - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - return ret; -} - -/* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; - int extra_credits = 3; + /* + * Credits for final inode cleanup and freeing: + * sb + inode (ext4_orphan_del()), block bitmap, group descriptor + * (xattr block freeing), bitmap, group descriptor (inode freeing) + */ + int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; trace_ext4_evict_inode(inode); @@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode) if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); + /* + * Block bitmap, group descriptor, and inode are accounted in both + * ext4_blocks_for_truncate() and extra_credits. So subtract 3. + */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, - ext4_blocks_for_truncate(inode)+extra_credits); + ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* @@ -827,136 +810,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, #define DIO_MAX_BLOCKS 4096 /* - * Get blocks function for the cases that need to start a transaction - - * generally difference cases of direct IO and DAX IO. It also handles retries - * in case of ENOSPC. 
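The expectations in the KUnit table above follow directly from the on-disk layout: the low two bits of the extra field extend the signed 32-bit seconds upward by whole 2^32 epochs, and the remaining 30 bits carry nanoseconds. A standalone decoder reproducing a few table rows (a userspace sketch of the same arithmetic; EXT4_EPOCH_BITS and the masks match the kernel definitions):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXT4_EPOCH_BITS	2
#define EXT4_EPOCH_MASK	((1u << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK	(~0u << EXT4_EPOCH_BITS)

static void decode(int32_t raw_sec, uint32_t extra,
		   int64_t *sec, uint32_t *nsec)
{
	/* Sign-extend the 32-bit field, then add whole epochs. */
	*sec = (int64_t)raw_sec +
	       ((uint64_t)(extra & EXT4_EPOCH_MASK) << 32);
	*nsec = (extra & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}

int main(void)
{
	int64_t sec;
	uint32_t nsec;

	decode(INT32_MIN, 0, &sec, &nsec);	/* 1901-12-13 row */
	assert(sec == -0x80000000LL);
	decode(INT32_MIN, 1, &sec, &nsec);	/* 2038 rollover row */
	assert(sec == 0x80000000LL);
	decode(0, 3, &sec, &nsec);		/* both epoch bits set */
	assert(sec == 0x300000000LL && nsec == 0);
	printf("all rows match\n");
	return 0;
}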
- */ -static int ext4_get_block_trans(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) -{ - int dio_credits; - handle_t *handle; - int retries = 0; - int ret; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) - bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; - dio_credits = ext4_chunk_trans_blocks(inode, - bh_result->b_size >> inode->i_blkbits); -retry: - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - ret = _ext4_get_block(inode, iblock, bh_result, flags); - ext4_journal_stop(handle); - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - return ret; -} - -/* Get block function for DIO reads and writes to inodes without extents */ -int ext4_dio_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - if (!create) - return _ext4_get_block(inode, iblock, bh, 0); - return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); -} - -/* - * Get block function for AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete. - */ -static int ext4_dio_get_block_unwritten_async(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * When doing DIO using unwritten extents, we need io_end to convert - * unwritten extents to written on IO completion. We allocate io_end - * once we spot unwritten extent and store it in b_private. Generic - * DIO code keeps b_private set and furthermore passes the value to - * our completion callback in 'private' argument. - */ - if (!ret && buffer_unwritten(bh_result)) { - if (!bh_result->b_private) { - ext4_io_end_t *io_end; - - io_end = ext4_init_io_end(inode, GFP_KERNEL); - if (!io_end) - return -ENOMEM; - bh_result->b_private = io_end; - ext4_set_io_unwritten_flag(inode, io_end); - } - set_buffer_defer_completion(bh_result); - } - - return ret; -} - -/* - * Get block function for non-AIO DIO writes when we create unwritten extent if - * blocks are not allocated yet. The extent will be converted to written - * after IO is complete by ext4_direct_IO_write(). - */ -static int ext4_dio_get_block_unwritten_sync(struct inode *inode, - sector_t iblock, struct buffer_head *bh_result, int create) -{ - int ret; - - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); - - /* - * Mark inode as having pending DIO writes to unwritten extents. - * ext4_direct_IO_write() checks this flag and converts extents to - * written. 
- */ - if (!ret && buffer_unwritten(bh_result)) - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - - return ret; -} - -static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - /* We don't expect handle for direct IO */ - WARN_ON_ONCE(ext4_journal_current_handle()); - - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); - - return ret; -} - - -/* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, @@ -2341,6 +2194,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* + * mpage_process_page - update page buffers corresponding to changed extent and + * may submit fully mapped page for IO + * + * @mpd - description of extent to map, on return next extent to map + * @m_lblk - logical block mapping. + * @m_pblk - corresponding physical mapping. + * @map_bh - determines on return whether this page requires any further + * mapping or not. + * Scan given page buffers corresponding to changed extent and update buffer + * state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits. + * If the given page is not fully mapped, we update @map to the next extent in + * the given page that needs mapping & return @map_bh as true. + */ +static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, + ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, + bool *map_bh) +{ + struct buffer_head *head, *bh; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + ext4_lblk_t lblk = *m_lblk; + ext4_fsblk_t pblock = *m_pblk; + int err = 0; + int blkbits = mpd->inode->i_blkbits; + ssize_t io_end_size = 0; + struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); + + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. 
+			 */
+			mpd->map.m_len = 0;
+			mpd->map.m_flags = 0;
+			io_end_vec->size += io_end_size;
+			io_end_size = 0;
+
+			err = mpage_process_page_bufs(mpd, head, bh, lblk);
+			if (err > 0)
+				err = 0;
+			if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+				io_end_vec = ext4_alloc_io_end_vec(io_end);
+				if (IS_ERR(io_end_vec)) {
+					err = PTR_ERR(io_end_vec);
+					goto out;
+				}
+				io_end_vec->offset = mpd->map.m_lblk << blkbits;
+			}
+			*map_bh = true;
+			goto out;
+		}
+		if (buffer_delay(bh)) {
+			clear_buffer_delay(bh);
+			bh->b_blocknr = pblock++;
+		}
+		clear_buffer_unwritten(bh);
+		io_end_size += (1 << blkbits);
+	} while (lblk++, (bh = bh->b_this_page) != head);
+
+	io_end_vec->size += io_end_size;
+	io_end_size = 0;
+	*map_bh = false;
+out:
+	*m_lblk = lblk;
+	*m_pblk = pblock;
+	return err;
+}
+
+/*
  * mpage_map_buffers - update buffers corresponding to changed extent and
  *		       submit fully mapped pages for IO
  *
@@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 	struct pagevec pvec;
 	int nr_pages, i;
 	struct inode *inode = mpd->inode;
-	struct buffer_head *head, *bh;
 	int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
 	pgoff_t start, end;
 	ext4_lblk_t lblk;
-	sector_t pblock;
+	ext4_fsblk_t pblock;
 	int err;
+	bool map_bh = false;
 
 	start = mpd->map.m_lblk >> bpp_bits;
 	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			bh = head = page_buffers(page);
-			do {
-				if (lblk < mpd->map.m_lblk)
-					continue;
-				if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-					/*
-					 * Buffer after end of mapped extent.
-					 * Find next buffer in the page to map.
-					 */
-					mpd->map.m_len = 0;
-					mpd->map.m_flags = 0;
-					/*
-					 * FIXME: If dioread_nolock supports
-					 * blocksize < pagesize, we need to make
-					 * sure we add size mapped so far to
-					 * io_end->size as the following call
-					 * can submit the page for IO.
-					 */
-					err = mpage_process_page_bufs(mpd, head,
-								      bh, lblk);
-					pagevec_release(&pvec);
-					if (err > 0)
-						err = 0;
-					return err;
-				}
-				if (buffer_delay(bh)) {
-					clear_buffer_delay(bh);
-					bh->b_blocknr = pblock++;
-				}
-				clear_buffer_unwritten(bh);
-			} while (lblk++, (bh = bh->b_this_page) != head);
-
+			err = mpage_process_page(mpd, page, &lblk, &pblock,
+						 &map_bh);
 			/*
-			 * FIXME: This is going to break if dioread_nolock
-			 * supports blocksize < pagesize as we will try to
-			 * convert potentially unmapped parts of inode.
+			 * If map_bh is true, the page may require further
+			 * buffer mapping, or it may have been submitted for
+			 * IO, so we return to map the next extent.
 			 */
-			mpd->io_submit.io_end->size += PAGE_SIZE;
+			if (err < 0 || map_bh == true)
+				goto out;
 			/* Page fully mapped - let IO run!
*/ err = mpage_submit_page(mpd, page); - if (err < 0) { - pagevec_release(&pvec); - return err; - } + if (err < 0) + goto out; } pagevec_release(&pvec); } @@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; +out: + pagevec_release(&pvec); + return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) @@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle, int err; loff_t disksize; int progress = 0; + ext4_io_end_t *io_end = mpd->io_submit.io_end; + struct ext4_io_end_vec *io_end_vec; - mpd->io_submit.io_end->offset = - ((loff_t)map->m_lblk) << inode->i_blkbits; + io_end_vec = ext4_alloc_io_end_vec(io_end); + if (IS_ERR(io_end_vec)) + return PTR_ERR(io_end_vec); + io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { @@ -3406,473 +3308,235 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) return inode->i_state & I_DIRTY_DATASYNC; } -static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned flags, struct iomap *iomap, struct iomap *srcmap) +static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, + struct ext4_map_blocks *map, loff_t offset, + loff_t length) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int blkbits = inode->i_blkbits; - unsigned long first_block, last_block; - struct ext4_map_blocks map; - bool delalloc = false; - int ret; - - if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) - return -EINVAL; - first_block = offset >> blkbits; - last_block = min_t(loff_t, (offset + length - 1) >> blkbits, - EXT4_MAX_LOGICAL_BLOCK); - - if (flags & IOMAP_REPORT) { - if (ext4_has_inline_data(inode)) { - ret = ext4_inline_data_iomap(inode, iomap); - if (ret != -EAGAIN) { - if (ret == 0 && offset >= iomap->length) - ret = -ENOENT; - return ret; - } - } - } else { - if (WARN_ON_ONCE(ext4_has_inline_data(inode))) - return -ERANGE; - } - - map.m_lblk = first_block; - map.m_len = last_block - first_block + 1; - - if (flags & IOMAP_REPORT) { - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret < 0) - return ret; - - if (ret == 0) { - ext4_lblk_t end = map.m_lblk + map.m_len - 1; - struct extent_status es; - - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - map.m_lblk, end, &es); - - if (!es.es_len || es.es_lblk > end) { - /* entire range is a hole */ - } else if (es.es_lblk > map.m_lblk) { - /* range starts with a hole */ - map.m_len = es.es_lblk - map.m_lblk; - } else { - ext4_lblk_t offs = 0; - - if (es.es_lblk < map.m_lblk) - offs = map.m_lblk - es.es_lblk; - map.m_lblk = es.es_lblk + offs; - map.m_len = es.es_len - offs; - delalloc = true; - } - } - } else if (flags & IOMAP_WRITE) { - int dio_credits; - handle_t *handle; - int retries = 0; - - /* Trim mapping request to maximum we can map at once for DIO */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); -retry: - /* - * Either we allocate blocks and then we don't get unwritten - * extent so we have reserved enough credits, or the blocks - * are already allocated and unwritten and in that case - * extent conversion fits in the credits as well. 
-	 */
-	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-				    dio_credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	ret = ext4_map_blocks(handle, inode, &map,
-			      EXT4_GET_BLOCKS_CREATE_ZERO);
-	if (ret < 0) {
-		ext4_journal_stop(handle);
-		if (ret == -ENOSPC &&
-		    ext4_should_retry_alloc(inode->i_sb, &retries))
-			goto retry;
-		return ret;
-	}
-
-	/*
-	 * If we added blocks beyond i_size, we need to make sure they
-	 * will get truncated if we crash before updating i_size in
-	 * ext4_iomap_end(). For faults we don't need to do that (and
-	 * even cannot because for orphan list operations inode_lock is
-	 * required) - if we happen to instantiate block beyond i_size,
-	 * it is because we race with truncate which has already added
-	 * the inode to the orphan list.
-	 */
-	if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
-	    (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
-		int err;
-
-		err = ext4_orphan_add(handle, inode);
-		if (err < 0) {
-			ext4_journal_stop(handle);
-			return err;
-		}
-	}
-	ext4_journal_stop(handle);
-	} else {
-		ret = ext4_map_blocks(NULL, inode, &map, 0);
-		if (ret < 0)
-			return ret;
-	}
+	u8 blkbits = inode->i_blkbits;
 
+	/*
+	 * Writes that span EOF might trigger an I/O size update on completion,
+	 * so consider them to be dirty for the purpose of O_DSYNC, even if
+	 * no other metadata changes are being made or are pending.
+	 */
 	iomap->flags = 0;
-	if (ext4_inode_datasync_dirty(inode))
+	if (ext4_inode_datasync_dirty(inode) ||
+	    offset + length > i_size_read(inode))
 		iomap->flags |= IOMAP_F_DIRTY;
+
+	if (map->m_flags & EXT4_MAP_NEW)
+		iomap->flags |= IOMAP_F_NEW;
+
 	iomap->bdev = inode->i_sb->s_bdev;
-	iomap->dax_dev = sbi->s_daxdev;
-	iomap->offset = (u64)first_block << blkbits;
-	iomap->length = (u64)map.m_len << blkbits;
+	iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+	iomap->offset = (u64) map->m_lblk << blkbits;
+	iomap->length = (u64) map->m_len << blkbits;
 
-	if (ret == 0) {
-		iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
-		iomap->addr = IOMAP_NULL_ADDR;
+	/*
+	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
+	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+	 * set. In order for any allocated unwritten extents to be converted
+	 * into written extents correctly within the ->end_io() handler, we
+	 * need to ensure that the iomap->type is set appropriately. Hence, we
+	 * check whether the EXT4_MAP_UNWRITTEN bit has been set first.
+ */ + if (map->m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + iomap->addr = (u64) map->m_pblk << blkbits; + } else if (map->m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = (u64) map->m_pblk << blkbits; } else { - if (map.m_flags & EXT4_MAP_MAPPED) { - iomap->type = IOMAP_MAPPED; - } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { - iomap->type = IOMAP_UNWRITTEN; - } else { - WARN_ON_ONCE(1); - return -EIO; - } - iomap->addr = (u64)map.m_pblk << blkbits; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; } - - if (map.m_flags & EXT4_MAP_NEW) - iomap->flags |= IOMAP_F_NEW; - - return 0; } -static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) +static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, + unsigned int flags) { - int ret = 0; handle_t *handle; - int blkbits = inode->i_blkbits; - bool truncate = false; - - if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) - return 0; + u8 blkbits = inode->i_blkbits; + int ret, dio_credits, m_flags = 0, retries = 0; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto orphan_del; - } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); /* - * We may need to truncate allocated but not written blocks beyond EOF. + * Trim the mapping request to the maximum value that we can map at + * once for direct I/O. */ - if (iomap->offset + iomap->length > - ALIGN(inode->i_size, 1 << blkbits)) { - ext4_lblk_t written_blk, end_blk; + if (map->m_len > DIO_MAX_BLOCKS) + map->m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); - written_blk = (offset + written) >> blkbits; - end_blk = (offset + length) >> blkbits; - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - } +retry: /* - * Remove inode from orphan list if we were extending a inode and - * everything went fine. + * Either we allocate blocks and then don't get an unwritten extent, so + * in that case we have reserved enough credits. Or, the blocks are + * already allocated and unwritten. In that case, the extent conversion + * fits into the credits as well. */ - if (!truncate && inode->i_nlink && - !list_empty(&EXT4_I(inode)->i_orphan)) - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - if (truncate) { - ext4_truncate_failed_write(inode); -orphan_del: - /* - * If truncate failed early the inode might still be on the - * orphan list; we need to make sure the inode is removed from - * the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - return ret; -} - -const struct iomap_ops ext4_iomap_ops = { - .iomap_begin = ext4_iomap_begin, - .iomap_end = ext4_iomap_end, -}; - -static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) -{ - ext4_io_end_t *io_end = private; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); - /* if not async direct IO just return */ - if (!io_end) - return 0; + /* + * DAX and direct I/O are the only two operations that are currently + * supported with IOMAP_WRITE. 
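Because a direct I/O write can leave an extent both mapped and unwritten, the type selection in ext4_set_iomap() above must test the unwritten bit before the mapped bit. A compact model of that ordering (the flag values and names here are illustrative, not the kernel's):

#include <stdio.h>

enum { MAP_MAPPED = 1 << 0, MAP_UNWRITTEN = 1 << 1 };
enum iomap_type { TYPE_HOLE, TYPE_MAPPED, TYPE_UNWRITTEN };

static enum iomap_type classify(unsigned flags)
{
	/* Unwritten must win even when MAPPED is also set, so the
	 * end_io handler knows a conversion is still required. */
	if (flags & MAP_UNWRITTEN)
		return TYPE_UNWRITTEN;
	if (flags & MAP_MAPPED)
		return TYPE_MAPPED;
	return TYPE_HOLE;
}

int main(void)
{
	printf("%d\n", classify(MAP_MAPPED | MAP_UNWRITTEN)); /* 2 */
	return 0;
}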
+ */ + WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); + if (IS_DAX(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + /* + * We use i_size instead of i_disksize here because delalloc writeback + * can complete at any point during the I/O and subsequently push the + * i_disksize out to i_size. This could be beyond where direct I/O is + * happening and thus expose allocated blocks to direct I/O reads. + */ + else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) + m_flags = EXT4_GET_BLOCKS_CREATE; + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - io_end, io_end->inode->i_ino, iocb, offset, size); + ret = ext4_map_blocks(handle, inode, map, m_flags); /* - * Error during AIO DIO. We cannot convert unwritten extents as the - * data was not written. Just clear the unwritten flag and drop io_end. + * We cannot fill holes in indirect tree based inodes as that could + * expose stale data in the case of a crash. Use the magic error code + * to fallback to buffered I/O. */ - if (size <= 0) { - ext4_clear_io_unwritten_flag(io_end); - size = 0; - } - io_end->offset = offset; - io_end->size = size; - ext4_put_io_end(io_end); + if (!m_flags && !ret) + ret = -ENOTBLK; - return 0; + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + return ret; } -/* - * Handling of direct IO writes. - * - * For ext4 extent files, ext4 will do direct-io write even to holes, - * preallocated extents, and those write extend the file, no need to - * fall back to buffered IO. - * - * For holes, we fallocate those blocks, mark them as unwritten - * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as unwritten. - * - * The unwritten extents will be converted to written when DIO is completed. - * For async direct IO, since the IO may still pending when return, we - * set up an end_io call back function, which will do the conversion - * when async direct IO completed. - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. 
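The recovery problem the removed comment describes, an O_DIRECT write that extends the file and then dies partway, is handled on the new path by the orphan-list entry plus ext4_handle_inode_extension(). Whether allocated-but-unwritten tail blocks must be trimmed comes down to rounding both the written end and the requested end up to a block boundary; the calculation in isolation (ALIGN_UP plays the role of the kernel's ALIGN macro, values chosen arbitrarily):

#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
	unsigned long long offset = 10000;	/* start of the write */
	unsigned long long count = 9000;	/* bytes requested */
	long long written = 4096;		/* bytes the DIO completed */
	unsigned int blocksize = 4096;

	unsigned long long written_blk = ALIGN_UP(offset + written, blocksize);
	unsigned long long end_blk = ALIGN_UP(offset + count, blocksize);

	/* Anything allocated between the two ends was never written. */
	printf("truncate tail blocks? %s\n",
	       written_blk < end_blk ? "yes" : "no");
	return 0;
}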
- * - */ -static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) + +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - ssize_t ret; - loff_t offset = iocb->ki_pos; - size_t count = iov_iter_count(iter); - int overwrite = 0; - get_block_t *get_block_func = NULL; - int dio_flags = 0; - loff_t final_size = offset + count; - int orphan = 0; - handle_t *handle; + int ret; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; - if (final_size > inode->i_size || final_size > ei->i_disksize) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ext4_update_i_disksize(inode, inode->i_size); - ext4_journal_stop(handle); - } + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; - BUG_ON(iocb->private == NULL); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; /* - * Make all waiters for direct IO properly wait also for extent - * conversion. This also disallows race between truncate() and - * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + * Calculate the first and last logical blocks respectively. */ - inode_dio_begin(inode); + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + + if (flags & IOMAP_WRITE) + ret = ext4_iomap_alloc(inode, &map, flags); + else + ret = ext4_map_blocks(NULL, inode, &map, 0); - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + if (ret < 0) + return ret; - if (overwrite) - inode_unlock(inode); + ext4_set_iomap(inode, iomap, &map, offset, length); + return 0; +} + +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ /* - * For extent mapped files we could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as unwritten to prevent - * parallel buffered read to expose the stale data before DIO complete - * the data IO. - * - * As to previously fallocated extents, ext4 get_block will just simply - * mark the buffer mapped but still keep the extents unwritten. - * - * For non AIO case, we will convert those unwritten extents to written - * after return back from blockdev_direct_IO. That way we save us from - * allocating io_end structure and also the overhead of offloading - * the extent convertion to a workqueue. - * - * For async DIO, the conversion needs to be deferred when the - * IO is completed. The ext4 end_io callback function will be - * called to take care of the conversion work. Here for async - * case, we allocate an io_end structure to hook to the iocb. + * Check to see whether an error occurred while writing out the data to + * the allocated blocks. If so, return the magic error code so that we + * fallback to buffered I/O and attempt to complete the remainder of + * the I/O. Any blocks that may have been allocated in preparation for + * the direct I/O will be reused during buffered I/O. 
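-ENOTBLK acts as a private "redo this buffered" signal between the iomap hooks above and the write path in fs/ext4/file.c; it is never returned to userspace. The calling convention reduced to a skeleton (function names invented for the sketch; the real dispatch lives in ext4_dio_write_iter()):

#include <errno.h>
#include <stdio.h>

static int direct_write(int supported)
{
	/* The iomap path reports -ENOTBLK when it wants the caller
	 * to redo the request through the page cache. */
	return supported ? 0 : -ENOTBLK;
}

static int buffered_write(void)
{
	return 0;
}

static int do_write(int dio_supported)
{
	int ret = direct_write(dio_supported);

	if (ret == -ENOTBLK)	/* magic code: retry buffered */
		ret = buffered_write();
	return ret;
}

int main(void)
{
	printf("%d %d\n", do_write(1), do_write(0));	/* 0 0 */
	return 0;
}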
*/ - iocb->private = NULL; - if (overwrite) - get_block_func = ext4_dio_get_block_overwrite; - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, i_blocksize(inode)) >= inode->i_size) { - get_block_func = ext4_dio_get_block; - dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; - } else if (is_sync_kiocb(iocb)) { - get_block_func = ext4_dio_get_block_unwritten_sync; - dio_flags = DIO_LOCKING; - } else { - get_block_func = ext4_dio_get_block_unwritten_async; - dio_flags = DIO_LOCKING; - } - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - get_block_func, ext4_end_io_dio, NULL, - dio_flags); + if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + return -ENOTBLK; - if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(NULL, inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + return 0; +} - inode_dio_end(inode); - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) - inode_lock(inode); +const struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; - if (ret < 0 && final_size > inode->i_size) - ext4_truncate_failed_write(inode); +static bool ext4_iomap_is_delalloc(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct extent_status es; + ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; - /* Handle extending of i_size after direct IO write */ - if (orphan) { - int err; + ext4_es_find_extent_range(inode, &ext4_es_is_delayed, + map->m_lblk, end, &es); - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* - * We wrote the data but cannot extend - * i_size. Bail out. In async io case, we do - * not return error here because we have - * already submmitted the corresponding - * bio. Returning error here makes the caller - * think that this IO is done and failed - * resulting in race with bio's completion - * handler. - */ - if (!ret) - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); + if (!es.es_len || es.es_lblk > end) + return false; - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size || end > ei->i_disksize) { - ext4_update_i_disksize(inode, end); - if (end > inode->i_size) - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. 
- */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; + if (es.es_lblk > map->m_lblk) { + map->m_len = es.es_lblk - map->m_lblk; + return false; } -out: - return ret; -} -static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); - ssize_t ret; + offset = map->m_lblk - es.es_lblk; + map->m_len = es.es_len - offset; - /* - * Shared inode_lock is enough for us - it protects against concurrent - * writes & truncates and since we take care of writing back page cache, - * we are protected against page writeback as well. - */ - inode_lock_shared(inode); - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, NULL, NULL, 0); -out_unlock: - inode_unlock_shared(inode); - return ret; + return true; } -static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - ssize_t ret; + int ret; + bool delalloc = false; + struct ext4_map_blocks map; + u8 blkbits = inode->i_blkbits; -#ifdef CONFIG_FS_ENCRYPTION - if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) - return 0; -#endif - if (fsverity_active(inode)) - return 0; + if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) + return -EINVAL; + + if (ext4_has_inline_data(inode)) { + ret = ext4_inline_data_iomap(inode, iomap); + if (ret != -EAGAIN) { + if (ret == 0 && offset >= iomap->length) + ret = -ENOENT; + return ret; + } + } /* - * If we are doing data journalling we don't support O_DIRECT + * Calculate the first and last logical block respectively. */ - if (ext4_should_journal_data(inode)) - return 0; + map.m_lblk = offset >> blkbits; + map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, + EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; - /* Let buffer I/O handle the inline data case. */ - if (ext4_has_inline_data(inode)) - return 0; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + if (ret == 0) + delalloc = ext4_iomap_is_delalloc(inode, &map); - trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == READ) - ret = ext4_direct_IO_read(iocb, iter); - else - ret = ext4_direct_IO_write(iocb, iter); - trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); - return ret; + ext4_set_iomap(inode, iomap, &map, offset, length); + if (delalloc && iomap->type == IOMAP_HOLE) + iomap->type = IOMAP_DELALLOC; + + return 0; } +const struct iomap_ops ext4_iomap_report_ops = { + .iomap_begin = ext4_iomap_begin_report, +}; + /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. 
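
Note: the report-only ops table above exists so that callers that merely query the block map can reuse the iomap machinery. A minimal usage sketch, assuming (as this series does elsewhere) that SEEK_HOLE goes through iomap_seek_hole(); the wrapper name is hypothetical:

	/*
	 * iomap_seek_hole() treats IOMAP_HOLE as a hole, which is why
	 * ext4_iomap_begin_report() upgrades delalloc ranges to
	 * IOMAP_DELALLOC before returning.
	 */
	static loff_t ext4_seek_hole_sketch(struct inode *inode, loff_t offset)
	{
		return iomap_seek_hole(inode, offset, &ext4_iomap_report_ops);
	}
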
We cannot do @@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) offset = inode->i_size & (PAGE_SIZE - 1); /* - * All buffers in the last page remain valid? Then there's nothing to - * do. We do the check mainly to optimize the common PAGE_SIZE == - * blocksize case + * If the page is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidatepage() may + * strip all buffers from the page but keep the page dirty which can then + * confuse e.g. concurrent ext4_writepage() seeing dirty page without + * buffers). Also we don't need to wait for any commit if all buffers in + * the page remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. 
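
Note: the rewritten test below folds the fully-truncated case into the existing all-buffers-valid case. As a worked example with PAGE_SIZE == 4096 and blocksize == 1024, the wait is skipped for offset 0 and for offsets 3073 through 4095. A sketch of the predicate (helper name hypothetical):

	/* offset here is i_size & (PAGE_SIZE - 1), as computed above. */
	static bool tail_page_wait_needed_sketch(unsigned int offset,
						 unsigned int blocksize)
	{
		/*
		 * !offset: the page is fully truncated; offset greater than
		 * PAGE_SIZE - blocksize: every buffer in the last page is
		 * still within i_size. Neither case needs to wait.
		 */
		return offset && offset <= PAGE_SIZE - blocksize;
	}
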
*/ - if (offset > PAGE_SIZE - i_blocksize(inode)) + if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { page = find_lock_page(inode->i_mapping, @@ -5717,12 +5385,15 @@ int ext4_getattr(const struct path *path, struct kstat *stat, stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; + if (flags & EXT4_VERITY_FL) + stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | - STATX_ATTR_NODUMP); + STATX_ATTR_NODUMP | + STATX_ATTR_VERITY); generic_fillattr(inode, stat); return 0; @@ -5912,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; + unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); int error; + /* this was checked at iget time, but double check for good measure */ + if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + return -EFSCORRUPTED; + } + if ((new_extra_isize < ei->i_extra_isize) || + (new_extra_isize < 4) || + (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) + return -EINVAL; /* Should never happen */ + raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); @@ -5965,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode, * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if (ext4_handle_valid(handle) && - jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0) + if (ext4_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0b7f316fd30f..e8870fff8224 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1360,6 +1360,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: + case FITRIM: case EXT4_IOC_PRECACHE_EXTENTS: case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b1e4d359f73b..89725fa42573 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode, needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); - /* - * Make sure the credit we accumalated is not really high - */ - if (needed && ext4_handle_has_enough_credits(handle, - EXT4_RESERVE_TRANS_BLOCKS)) { - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } else if (needed) { - retval = ext4_journal_extend(handle, needed); - if (retval) { - /* - * IF not able to extend the journal restart the journal - */ - up_write((&EXT4_I(inode)->i_data_sem)); - retval = ext4_journal_restart(handle, needed); - down_write((&EXT4_I(inode)->i_data_sem)); - if (retval) - goto err_out; - } - } + retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); + if (retval < 0) + goto err_out; retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); @@ 
-196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, } -static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) -{ - int retval = 0, needed; - - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - /* - * We are freeing a blocks. During this we touch - * superblock, group descriptor and block bitmap. - * So allocate a credit of 3. We may update - * quota (user and group). - */ - needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - if (ext4_journal_extend(handle, needed) != 0) - retval = ext4_journal_restart(handle, needed); - - return retval; -} - static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; + struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + int err; - bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) { + put_bh(bh); + return err; + } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(sb, 1)); + if (err < 0) + return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle, } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { - extend_credit_for_blkdel(handle, inode); + retval = ext4_journal_ensure_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | @@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * One credit accounted for writing the * i_data field of the original inode */ - retval = ext4_journal_extend(handle, 1); - if (retval) { - retval = ext4_journal_restart(handle, 1); - if (retval) - goto err_out; - } + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; @@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); - if (retval) - break; + if (retval) { + put_bh(bh); + return retval; + } } } put_bh(bh); - extend_credit_for_blkdel(handle, inode); + retval = 
ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (retval < 0) + return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - return retval; + return 0; } /* @@ -574,9 +554,9 @@ err_out: } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ - if (ext4_journal_extend(handle, 1) != 0) - ext4_journal_restart(handle, 1); - + retval = ext4_journal_ensure_credits(handle, 1, 0); + if (retval < 0) + goto out_stop; /* * Mark the tmp_inode as of size zero */ @@ -594,6 +574,7 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); +out_stop: ext4_journal_stop(handle); out: unlock_new_inode(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a427d2031a8d..a856997d87b5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2547,18 +2547,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode) } +/* + * Add a non-directory inode to a directory. On success, the inode reference is + * consumed by the dentry instantiation. This is also indicated by the clearing + * of the *inodep pointer. On failure, the caller is responsible for dropping + * the inode reference in a safe context. + */ static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) + struct dentry *dentry, struct inode **inodep) { + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); d_instantiate_new(dentry, inode); + *inodep = NULL; return 0; } drop_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); - iput(inode); return err; } @@ -2592,12 +2603,12 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } @@ -2624,12 +2635,12 @@ retry: if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); + err = ext4_add_nondir(handle, dentry, &inode); } if (handle) ext4_journal_stop(handle); + if (!IS_ERR_OR_NULL(inode)) + iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } @@ -2779,10 +2790,12 @@ retry: if (err) { out_clear_inode: clear_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); iput(inode); - goto out_stop; + goto out_retry; } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); @@ -2796,6 +2809,7 @@ out_clear_inode: out_stop: if (handle) ext4_journal_stop(handle); +out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } @@ -3182,18 +3196,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (inode->i_nlink == 0) { - ext4_warning_inode(inode, "Deleting file '%.*s' with no links", - dentry->d_name.len, dentry->d_name.name); -
set_nlink(inode, 1); - } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); - drop_nlink(inode); + if (inode->i_nlink == 0) + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); + else + drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); @@ -3328,12 +3341,11 @@ static int ext4_symlink(struct inode *dir, inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; - err = ext4_add_nondir(handle, dentry, inode); - if (!err && IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - + err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); + if (inode) + iput(inode); goto out_free_encrypted_link; err_drop_inode: diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 12ceadef32c5..24aeedb8fc75 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -31,18 +31,56 @@ #include "acl.h" static struct kmem_cache *io_end_cachep; +static struct kmem_cache *io_end_vec_cachep; int __init ext4_init_pageio(void) { io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); if (io_end_cachep == NULL) return -ENOMEM; + + io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0); + if (io_end_vec_cachep == NULL) { + kmem_cache_destroy(io_end_cachep); + return -ENOMEM; + } return 0; } void ext4_exit_pageio(void) { kmem_cache_destroy(io_end_cachep); + kmem_cache_destroy(io_end_vec_cachep); +} + +struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec; + + io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS); + if (!io_end_vec) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&io_end_vec->list); + list_add_tail(&io_end_vec->list, &io_end->list_vec); + return io_end_vec; +} + +static void ext4_free_io_end_vec(ext4_io_end_t *io_end) +{ + struct ext4_io_end_vec *io_end_vec, *tmp; + + if (list_empty(&io_end->list_vec)) + return; + list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) { + list_del(&io_end_vec->list); + kmem_cache_free(io_end_vec_cachep, io_end_vec); + } +} + +struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end) +{ + BUG_ON(list_empty(&io_end->list_vec)); + return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list); } /* @@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) ext4_finish_bio(bio); bio_put(bio); } + ext4_free_io_end_vec(io_end); kmem_cache_free(io_end_cachep, io_end); } @@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) * cannot get to ext4_ext_truncate() before all IOs overlapping that range are * completed (happens from ext4_free_ioend()). 
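
Note: ext4_alloc_io_end_vec()/ext4_last_io_end_vec() above replace the single offset/size pair an io_end used to carry with a list of ranges. A minimal lifecycle sketch; the caller and the offset/size members of struct ext4_io_end_vec are assumptions (the struct body is not shown in this hunk):

	static int io_end_vec_sketch(struct inode *inode, loff_t off, ssize_t len)
	{
		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
		struct ext4_io_end_vec *vec;

		if (!io_end)
			return -ENOMEM;
		vec = ext4_alloc_io_end_vec(io_end);	/* linked onto list_vec */
		if (IS_ERR(vec)) {
			ext4_put_io_end(io_end);
			return PTR_ERR(vec);
		}
		vec->offset = off;	/* assumed fields, per this series */
		vec->size = len;
		/*
		 * On the last reference, unwritten ranges (if flagged) are
		 * converted and all vecs are freed.
		 */
		return ext4_put_io_end(io_end);
	}
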
*/ -static int ext4_end_io(ext4_io_end_t *io) +static int ext4_end_io_end(ext4_io_end_t *io_end) { - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - handle_t *handle = io->handle; + struct inode *inode = io_end->inode; + handle_t *handle = io_end->handle; int ret = 0; - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); + io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_extents(handle, inode, offset, size); + io_end->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " - "(inode %lu, offset %llu, size %zd, error %d)", - inode->i_ino, offset, size, ret); + "(inode %lu, error %d)", inode->i_ino, ret); } - ext4_clear_io_unwritten_flag(io); - ext4_release_io_end(io); + ext4_clear_io_unwritten_flag(io_end); + ext4_release_io_end(io_end); return ret; } @@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) { #ifdef EXT4FS_DEBUG struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; + ext4_io_end_t *io_end, *io_end0, *io_end1; if (list_empty(head)) return; ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); - list_for_each_entry(io, head, list) { - cur = &io->list; + list_for_each_entry(io_end, head, list) { + cur = &io_end->list; before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); + io_end0 = container_of(before, ext4_io_end_t, list); after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); + io_end1 = container_of(after, ext4_io_end_t, list); ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); + io_end, inode->i_ino, io_end0, io_end1); } #endif } @@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) static int ext4_do_flush_completed_IO(struct inode *inode, struct list_head *head) { - ext4_io_end_t *io; + ext4_io_end_t *io_end; struct list_head unwritten; unsigned long flags; struct ext4_inode_info *ei = EXT4_I(inode); @@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode, spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); while (!list_empty(&unwritten)) { - io = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); - list_del_init(&io->list); + io_end = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io_end->list); - err = ext4_end_io(io); + err = ext4_end_io_end(io_end); if (unlikely(!ret && err)) ret = err; } @@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work) ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { - ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); - if (io) { - io->inode = inode; - INIT_LIST_HEAD(&io->list); - atomic_set(&io->count, 1); + ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags); + + if (io_end) { + io_end->inode = inode; + INIT_LIST_HEAD(&io_end->list); + INIT_LIST_HEAD(&io_end->list_vec); + atomic_set(&io_end->count, 1); } - return io; + return 
io_end; } void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (atomic_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || + list_empty(&io_end->list_vec)) { ext4_release_io_end(io_end); return; } @@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end) if (atomic_dec_and_test(&io_end->count)) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_extents(io_end->handle, - io_end->inode, io_end->offset, - io_end->size); + err = ext4_convert_unwritten_io_end_vec(io_end->handle, + io_end); io_end->handle = NULL; ext4_clear_io_unwritten_flag(io_end); } @@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio) struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " - "(offset %llu size %ld starting block %llu)", + "starting block %llu)", bio->bi_status, inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); mapping_set_error(inode->i_mapping, @@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io, io->io_end = NULL; } -static int io_submit_init_bio(struct ext4_io_submit *io, - struct buffer_head *bh) +static void io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) { struct bio *bio; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). + */ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - if (!bio) - return -ENOMEM; bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; @@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io, io->io_bio = bio; io->io_next_block = bh->b_blocknr; wbc_init_bio(io->io_wbc, bio); - return 0; } -static int io_submit_add_bh(struct ext4_io_submit *io, - struct inode *inode, - struct page *page, - struct buffer_head *bh) +static void io_submit_add_bh(struct ext4_io_submit *io, + struct inode *inode, + struct page *page, + struct buffer_head *bh) { int ret; @@ -388,9 +425,7 @@ submit_and_retry: ext4_io_submit(io); } if (io->io_bio == NULL) { - ret = io_submit_init_bio(io, bh); - if (ret) - return ret; + io_submit_init_bio(io, bh); io->io_bio->bi_write_hint = inode->i_write_hint; } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); @@ -398,7 +433,6 @@ submit_and_retry: goto submit_and_retry; wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); io->io_next_block++; - return 0; } int ext4_bio_write_page(struct ext4_io_submit *io, @@ -491,8 +525,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } - bounce_page = NULL; - goto out; + + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); + goto unlock; } } @@ -500,30 +540,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh); - if (ret) { - /* - * We only get here on ENOMEM. Not much else - * we can do but mark the page as dirty, and - * better luck next time. - */ - break; - } + io_submit_add_bh(io, inode, + bounce_page ? bounce_page : page, bh); nr_submitted++; clear_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - /* Error stopped previous loop? 
Clean up buffers... */ - if (ret) { - out: - fscrypt_free_bounce_page(bounce_page); - printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); - do { - clear_buffer_async_write(bh); - bh = bh->b_this_page; - } while (bh != head); - } +unlock: unlock_page(page); /* Nothing submitted - we have to end page writeback */ if (!nr_submitted) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a30b203fa461..fef7755300c3 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -360,10 +360,12 @@ int ext4_mpage_readpages(struct address_space *mapping, if (bio == NULL) { struct bio_post_read_ctx *ctx; + /* + * bio_alloc will _always_ be able to allocate a bio if + * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). + */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) - goto set_error_page; ctx = get_bio_post_read_ctx(inode, bio, page->index); if (IS_ERR(ctx)) { bio_put(bio); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c0e9aef376a7..a8c0f2b5b6e1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, return bh; } -/* - * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. - * If that fails, restart the transaction & regain write access for the - * buffer head which is used for block_bitmap modifications. - */ -static int extend_or_restart_transaction(handle_t *handle, int thresh) +static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits) { - int err; - - if (ext4_handle_has_enough_credits(handle, thresh)) - return 0; - - err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); - if (err < 0) - return err; - if (err) { - err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); - if (err) - return err; - } - - return 0; + return ext4_journal_ensure_credits_fn(handle, credits, + EXT4_MAX_TRANS_DATA, 0, 0); } /* @@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, continue; } - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); @@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; gdb = sb_getblk(sb, block); @@ -602,8 +584,8 @@ handle_bb: /* Initialize block bitmap of the @group */ block = group_data[i].block_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; bh = bclean(handle, sb, block); @@ -631,8 +613,8 @@ handle_ib: /* Initialize inode bitmap of the @group */ block = group_data[i].inode_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) goto out; /* Mark unused entries in inode bitmap used */ bh = bclean(handle, sb, block); @@ -1109,10 +1091,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ - if (ext4_handle_valid(handle) && - handle->h_buffer_credits == 0 && - ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && - (err = 
ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + err = ext4_resize_ensure_credits_batch(handle, 1); + if (err < 0) break; if (meta_bg == 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dd654e53ba3d..1d82b56d9b11 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1172,9 +1172,9 @@ void ext4_clear_inode(struct inode *inode) { invalidate_inode_buffers(inode); clear_inode(inode); - dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1345,6 +1345,18 @@ static bool ext4_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); } +static bool ext4_has_stable_inodes(struct super_block *sb) +{ + return ext4_has_feature_stable_inodes(sb); +} + +static void ext4_get_ino_and_lblk_bits(struct super_block *sb, + int *ino_bits_ret, int *lblk_bits_ret) +{ + *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); + *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); +} + static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, @@ -1352,6 +1364,8 @@ static const struct fscrypt_operations ext4_cryptops = { .dummy_context = ext4_dummy_context, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, + .has_stable_inodes = ext4_has_stable_inodes, + .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; #endif @@ -1374,7 +1388,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -1392,7 +1405,7 @@ static const struct dquot_operations ext4_quota_operations = { .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, - .get_next_id = ext4_get_next_id, + .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2051,7 +2064,7 @@ static int parse_options(char *options, struct super_block *sb, unsigned int *journal_ioprio, int is_remount) { - struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; substring_t args[MAX_OPT_ARGS]; int token; @@ -2105,16 +2118,6 @@ static int parse_options(char *options, struct super_block *sb, } } #endif - if (test_opt(sb, DIOREAD_NOLOCK)) { - int blocksize = - BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - return 0; - } - } return 1; } @@ -3555,12 +3558,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + unsigned def_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; - /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && - sbi->s_want_extra_isize == 0) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = 0; + return; + } + if (sbi->s_want_extra_isize < 4) { + 
sbi->s_want_extra_isize = def_extra_isize; if (ext4_has_feature_extra_isize(sb)) { if (sbi->s_want_extra_isize < le16_to_cpu(es->s_want_extra_isize)) @@ -3573,10 +3579,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb) } } /* Check if enough inode space is available */ - if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; + if ((sbi->s_want_extra_isize > sbi->s_inode_size) || + (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size)) { + sbi->s_want_extra_isize = def_extra_isize; ext4_msg(sb, KERN_INFO, "required extra inode space not available"); } @@ -4439,13 +4445,6 @@ no_journal: } } - if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && - (blocksize != PAGE_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Unsupported blocksize for fs encryption"); - goto failed_mount_wq; - } - if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); goto failed_mount_wq; @@ -5835,7 +5834,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); - err = dquot_enable(qf_inode, type, format_id, flags); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); @@ -6019,18 +6018,6 @@ out: } return len; } - -static int ext4_get_next_id(struct super_block *sb, struct kqid *qid) -{ - const struct quota_format_ops *ops; - - if (!sb_has_quota_loaded(sb, qid->type)) - return -ESRCH; - ops = sb_dqopt(sb)->ops[qid->type]; - if (!ops || !ops->get_next_id) - return -ENOSYS; - return dquot_get_next_id(sb, qid); -} #endif static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 491f9ee4040e..8966a5439a22 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, return credits; } -static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode, - int credits, struct buffer_head *bh, - bool dirty, bool block_csum) -{ - int error; - - if (!ext4_handle_valid(handle)) - return 0; - - if (handle->h_buffer_credits >= credits) - return 0; - - error = ext4_journal_extend(handle, credits - handle->h_buffer_credits); - if (!error) - return 0; - if (error < 0) { - ext4_warning(inode->i_sb, "Extend journal (error %d)", error); - return error; - } - - if (bh && dirty) { - if (block_csum) - ext4_xattr_block_csum_set(inode, bh); - error = ext4_handle_dirty_metadata(handle, NULL, bh); - if (error) { - ext4_warning(inode->i_sb, "Handle metadata (error %d)", - error); - return error; - } - } - - error = ext4_journal_restart(handle, credits); - if (error) { - ext4_warning(inode->i_sb, "Restart journal (error %d)", error); - return error; - } - - if (bh) { - error = ext4_journal_get_write_access(handle, bh); - if (error) { - ext4_warning(inode->i_sb, - "Get write access failed (error %d)", - error); - return error; - } - } - return 0; -} - static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { @@ -1149,6 +1100,24 @@ cleanup: return saved_err; } +static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh, bool 
block_csum, bool dirty) +{ + int error; + + if (bh && dirty) { + if (block_csum) + ext4_xattr_block_csum_set(inode, bh); + error = ext4_handle_dirty_metadata(handle, NULL, bh); + if (error) { + ext4_warning(inode->i_sb, "Handle metadata (error %d)", + error); + return error; + } + } + return 0; +} + static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, @@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, continue; } - err = ext4_xattr_ensure_credits(handle, parent, credits, bh, - dirty, block_csum); - if (err) { + err = ext4_journal_ensure_credits_fn(handle, credits, credits, + ext4_free_metadata_revoke_credits(parent->i_sb, 1), + ext4_xattr_restart_fn(handle, parent, bh, block_csum, + dirty)); + if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } + if (err > 0) { + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_warning_inode(ea_inode, + "Re-get write access err=%d", + err); + continue; + } + } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { @@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, flags & XATTR_CREATE); brelse(bh); - if (!ext4_handle_has_enough_credits(handle, credits)) { + if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } @@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct inode *ea_inode; int error; - error = ext4_xattr_ensure_credits(handle, inode, extra_credits, - NULL /* bh */, - false /* dirty */, - false /* block_csum */); - if (error) { + error = ext4_journal_ensure_credits(handle, extra_credits, + ext4_free_metadata_revoke_credits(inode->i_sb, 1)); + if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a0eef95b9e0e..ffdaba0c55d2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -581,7 +581,7 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); - f2fs_show_injection_info(FAULT_ORPHAN); + f2fs_show_injection_info(sbi, FAULT_ORPHAN); return -ENOSPC; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5755e897a5f0..a034cd0ce021 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,7 @@ #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; +static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; static bool __is_cp_guaranteed(struct page *page) @@ -167,9 +168,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { - if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), - FAULT_READ_IO)) { - f2fs_show_injection_info(FAULT_READ_IO); + struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + + if (time_to_inject(sbi, FAULT_READ_IO)) { + f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } @@ -191,7 +193,7 @@ static void f2fs_write_end_io(struct bio *bio) struct bvec_iter_all iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { - f2fs_show_injection_info(FAULT_WRITE_IO); + f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; } @@ -543,6 +545,126 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, return io_type_is_mergeable(io, fio); } +static void add_bio_entry(struct f2fs_sb_info *sbi, struct 
bio *bio, + struct page *page, enum temp_type temp) +{ + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct bio_entry *be; + + be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + be->bio = bio; + bio_get(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) + f2fs_bug_on(sbi, 1); + + down_write(&io->bio_list_lock); + list_add_tail(&be->list, &io->bio_list); + up_write(&io->bio_list_lock); +} + +static void del_bio_entry(struct bio_entry *be) +{ + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); +} + +static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, + struct page *page) +{ + enum temp_type temp; + bool found = false; + int ret = -EAGAIN; + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (be->bio != *bio) + continue; + + found = true; + + if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { + ret = 0; + break; + } + + /* bio is full */ + del_bio_entry(be); + __submit_bio(sbi, *bio, DATA); + break; + } + up_write(&io->bio_list_lock); + } + + if (ret) { + bio_put(*bio); + *bio = NULL; + } + + return ret; +} + +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct page *page) +{ + enum temp_type temp; + bool found = false; + struct bio *target = bio ? *bio : NULL; + + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { + struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; + struct list_head *head = &io->bio_list; + struct bio_entry *be; + + if (list_empty(head)) + continue; + + down_read(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + page, 0); + if (found) + break; + } + up_read(&io->bio_list_lock); + + if (!found) + continue; + + found = false; + + down_write(&io->bio_list_lock); + list_for_each_entry(be, head, list) { + if (target) + found = (target == be->bio); + else + found = __has_merged_page(be->bio, NULL, + page, 0); + if (found) { + target = be->bio; + del_bio_entry(be); + break; + } + } + up_write(&io->bio_list_lock); + } + + if (found) + __submit_bio(sbi, target, DATA); + if (bio && *bio) { + bio_put(*bio); + *bio = NULL; + } +} + int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; @@ -557,20 +679,17 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, - fio->new_blkaddr)) { - __submit_bio(fio->sbi, bio, fio->type); - bio = NULL; - } + fio->new_blkaddr)) + f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_PAGES); bio_set_op_attrs(bio, fio->op, fio->op_flags); - } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - __submit_bio(fio->sbi, bio, fio->type); - bio = NULL; - goto alloc_new; + add_bio_entry(fio->sbi, bio, page, fio->temp); + } else { + if (add_ipu_page(fio->sbi, &bio, page)) + goto alloc_new; } if (fio->io_wbc) @@ -584,19 +703,6 @@ alloc_new: return 0; } -static void f2fs_submit_ipu_bio(struct f2fs_sb_info *sbi, struct bio **bio, - struct page *page) -{ - if (!bio) - return; - - if (!__has_merged_page(*bio, NULL, page, 0)) - return; - - __submit_bio(sbi, *bio, DATA); - *bio = NULL; -} - void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = 
fio->sbi; @@ -2098,7 +2204,7 @@ static int __write_data_page(struct page *page, bool *submitted, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_SHIFT; - loff_t psize = (page->index + 1) << PAGE_SHIFT; + loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -2215,14 +2321,12 @@ out: unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) { - f2fs_submit_ipu_bio(sbi, bio, page); + !F2FS_I(inode)->cp_task) f2fs_balance_fs(sbi, need_balance_fs); - } if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_ipu_bio(sbi, bio, page); f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_ipu_write(sbi, bio, NULL); submitted = NULL; } @@ -2342,13 +2446,11 @@ continue_unlock: } if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) { + if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_submit_ipu_bio(sbi, &bio, page); - } else { + else goto continue_unlock; - } } if (!clear_page_dirty_for_io(page)) @@ -2406,7 +2508,7 @@ continue_unlock: NULL, 0, DATA); /* submit cached bio of IPU write */ if (bio) - __submit_bio(sbi, bio, DATA); + f2fs_submit_merged_ipu_write(sbi, &bio, NULL); return ret; } @@ -3211,8 +3313,22 @@ fail: return -ENOMEM; } -void __exit f2fs_destroy_post_read_processing(void) +void f2fs_destroy_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); } + +int __init f2fs_init_bio_entry_cache(void) +{ + bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + return -ENOMEM; + return 0; +} + +void __exit f2fs_destroy_bio_entry_cache(void) +{ + kmem_cache_destroy(bio_entry_slab); +} diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4033778bcbbf..c967cacf979e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -628,7 +628,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { - f2fs_show_injection_info(FAULT_DIR_DEPTH); + f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH); return -ENOSPC; } @@ -919,8 +919,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, bit_pos++; ctx->pos = start_pos + bit_pos; printk_ratelimited( - "%s, invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, le32_to_cpu(de->ino)); + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4024790028aa..5a888a063c7f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -890,6 +890,7 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, + CURSEG_COLD_DATA_PINNED,/* cold data for pinned file */ }; struct flush_cmd { @@ -1068,6 +1069,11 @@ struct f2fs_io_info { unsigned char version; /* version of the node */ }; +struct bio_entry { + struct bio *bio; + struct list_head list; +}; + #define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ @@ -1077,6 +1083,8 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ + struct list_head bio_list; /* bio entry list head */ + struct 
rw_semaphore bio_list_lock; /* lock to protect bio entry list */ }; #define FDEV(i) (sbi->devs[i]) @@ -1289,11 +1297,13 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ /* for skip statistic */ + unsigned int atomic_files; /* # of opened atomic file */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; + struct rw_semaphore pin_sem; /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -1365,9 +1375,10 @@ struct f2fs_private_dio { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(type) \ - printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \ - KERN_INFO, f2fs_fault_name[type], \ +#define f2fs_show_injection_info(sbi, type) \ + printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ + KERN_INFO, sbi->sb->s_id, \ + f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { @@ -1387,7 +1398,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) return false; } #else -#define f2fs_show_injection_info(type) do { } while (0) +#define f2fs_show_injection_info(sbi, type) do { } while (0) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { return false; @@ -1772,7 +1783,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, return ret; if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(FAULT_BLOCK); + f2fs_show_injection_info(sbi, FAULT_BLOCK); release = *count; goto release_quota; } @@ -2024,7 +2035,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, } if (time_to_inject(sbi, FAULT_BLOCK)) { - f2fs_show_injection_info(FAULT_BLOCK); + f2fs_show_injection_info(sbi, FAULT_BLOCK); goto enospc; } @@ -2139,7 +2150,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return page; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(FAULT_PAGE_ALLOC); + f2fs_show_injection_info(F2FS_M_SB(mapping), + FAULT_PAGE_ALLOC); return NULL; } } @@ -2154,7 +2166,7 @@ static inline struct page *f2fs_pagecache_get_page( int fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { - f2fs_show_injection_info(FAULT_PAGE_GET); + f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET); return NULL; } @@ -2223,7 +2235,7 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, return bio; } if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(FAULT_ALLOC_BIO); + f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); return NULL; } @@ -2704,6 +2716,20 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool f2fs_is_time_consistent(struct inode *inode) +{ + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) + return false; + return true; +} + static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { bool ret; @@ -2721,14 +2747,7 @@ static inline 
bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) - return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) - return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) - return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, - &F2FS_I(inode)->i_crtime)) + if (!f2fs_is_time_consistent(inode)) return false; down_read(&F2FS_I(inode)->i_sem); @@ -2783,7 +2802,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, void *ret; if (time_to_inject(sbi, FAULT_KMALLOC)) { - f2fs_show_injection_info(FAULT_KMALLOC); + f2fs_show_injection_info(sbi, FAULT_KMALLOC); return NULL; } @@ -2804,7 +2823,7 @@ static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { if (time_to_inject(sbi, FAULT_KVMALLOC)) { - f2fs_show_injection_info(FAULT_KVMALLOC); + f2fs_show_injection_info(sbi, FAULT_KVMALLOC); return NULL; } @@ -3102,7 +3121,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); @@ -3188,10 +3207,14 @@ void f2fs_destroy_checkpoint_caches(void); */ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); +int f2fs_init_bio_entry_cache(void); +void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type); +void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, + struct bio **bio, struct page *page); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 29bc0a542759..85af112e868d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -681,7 +681,7 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { - f2fs_show_injection_info(FAULT_TRUNCATE); + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE); return -EIO; } @@ -726,11 +726,14 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & F2FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; + if (IS_VERITY(inode)) + stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | - STATX_ATTR_NODUMP); + STATX_ATTR_NODUMP | + STATX_ATTR_VERITY); generic_fillattr(inode, stat); @@ -1139,7 +1142,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } dn.ofs_in_node++; i++; - new_size = (dst + i) << PAGE_SHIFT; + new_size = (loff_t)(dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); @@ -1545,12 +1548,44 @@ static int expand_inode_data(struct 
inode *inode, loff_t offset, if (off_end) map.m_len++; - if (f2fs_is_pinned_file(inode)) - map.m_seg_type = CURSEG_COLD_DATA; + if (!map.m_len) + return 0; + + if (f2fs_is_pinned_file(inode)) { + block_t len = (map.m_len >> sbi->log_blocks_per_seg) << + sbi->log_blocks_per_seg; + block_t done = 0; + + if (map.m_len % sbi->blocks_per_seg) + len += sbi->blocks_per_seg; + + map.m_len = sbi->blocks_per_seg; +next_alloc: + if (has_not_enough_free_secs(sbi, 0, + GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { + mutex_lock(&sbi->gc_mutex); + err = f2fs_gc(sbi, true, false, NULL_SEGNO); + if (err && err != -ENODATA && err != -EAGAIN) + goto out_err; + } + + down_write(&sbi->pin_sem); + map.m_seg_type = CURSEG_COLD_DATA_PINNED; + f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA); + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + up_write(&sbi->pin_sem); - err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ? - F2FS_GET_BLOCK_PRE_DIO : - F2FS_GET_BLOCK_PRE_AIO)); + done += map.m_len; + len -= map.m_len; + map.m_lblk += map.m_len; + if (!err && len) + goto next_alloc; + + map.m_len = done; + } else { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + } +out_err: if (err) { pgoff_t last_off; @@ -1890,6 +1925,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (list_empty(&fi->inmem_ilist)) list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); /* add inode in inmem_list first and set atomic_file */ @@ -3403,6 +3439,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RELEASE_VOLATILE_WRITE: case F2FS_IOC_ABORT_VOLATILE_WRITE: case F2FS_IOC_SHUTDOWN: + case FITRIM: case F2FS_IOC_SET_ENCRYPTION_POLICY: case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5877bd729689..b3d399623290 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -54,7 +54,7 @@ static int gc_thread_func(void *data) } if (time_to_inject(sbi, FAULT_CHECKPOINT)) { - f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } @@ -1012,8 +1012,14 @@ next_step: block_t start_bidx; nid_t nid = le32_to_cpu(entry->nid); - /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + /* + * stop BG_GC if there is not enough free sections. + * Or, stop GC if the segment becomes fully valid caused by + * race condition along with SSR block allocation. 
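
Note: the comment above describes the widened GC bail-out; the condition that follows can be read standalone as below (helper name hypothetical, predicates as used in the hunk):

	static bool gc_should_stop_sketch(struct f2fs_sb_info *sbi,
					  unsigned int segno, int gc_type)
	{
		/* Background GC must not eat into reserved free sections. */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return true;
		/* SSR may have refilled the victim segment while GC scanned it. */
		return get_valid_blocks(sbi, segno, false) == sbi->blocks_per_seg;
	}
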
+ */ + if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || + get_valid_blocks(sbi, segno, false) == + sbi->blocks_per_seg) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1437,11 +1443,20 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); raw_sb->block_count = cpu_to_le64(block_count + (long long)segs * sbi->blocks_per_seg); + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + int dev_segs = + le32_to_cpu(raw_sb->devs[last_dev].total_segments); + + raw_sb->devs[last_dev].total_segments = + cpu_to_le32(dev_segs + segs); + } } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) { int segs = secs * sbi->segs_per_sec; + long long blks = (long long)segs * sbi->blocks_per_seg; long long user_block_count = le64_to_cpu(F2FS_CKPT(sbi)->user_block_count); @@ -1449,8 +1464,20 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; - F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + - (long long)segs * sbi->blocks_per_seg); + F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); + + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + + FDEV(last_dev).total_segments = + (int)FDEV(last_dev).total_segments + segs; + FDEV(last_dev).end_blk = + (long long)FDEV(last_dev).end_blk + blks; +#ifdef CONFIG_BLK_DEV_ZONED + FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz + + (int)(blks >> sbi->log_blocks_per_blkz); +#endif + } } int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) @@ -1465,6 +1492,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) if (block_count > old_block_count) return -EINVAL; + if (f2fs_is_multi_device(sbi)) { + int last_dev = sbi->s_ndevs - 1; + __u64 last_segs = FDEV(last_dev).total_segments; + + if (block_count + last_segs * sbi->blocks_per_seg <= + old_block_count) + return -EINVAL; + } + /* new fs size should align to section size */ div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem); if (rem) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index db4fec30c30d..502bd491336a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -615,7 +615,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) + /* + * atime could be updated without dirtying f2fs inode in lazytime mode + */ + if (f2fs_is_time_consistent(inode) && + !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; if (!f2fs_is_checkpoint_ready(sbi)) @@ -677,7 +681,7 @@ retry: err = f2fs_truncate(inode); if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(FAULT_EVICT_INODE); + f2fs_show_injection_info(sbi, FAULT_EVICT_INODE); err = -EIO; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 4faf06e8bf89..a1c507b0b4ac 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -981,7 +981,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old_dir_entry || whiteout) file_lost_pino(old_inode); else - F2FS_I(old_inode)->i_pino = new_dir->i_ino; + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -1141,7 +1142,11 @@ static 
int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(old_dir, old_entry, old_page, new_inode); down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); + if (!old_dir_entry) + file_lost_pino(old_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(old_inode, new_dir->i_ino); up_write(&F2FS_I(old_inode)->i_sem); old_dir->i_ctime = current_time(old_dir); @@ -1156,7 +1161,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); down_write(&F2FS_I(new_inode)->i_sem); - file_lost_pino(new_inode); + if (!new_dir_entry) + file_lost_pino(new_inode); + else + /* adjust dir's i_pino to pass fsck check */ + f2fs_i_pino_write(new_inode, old_dir->i_ino); up_write(&F2FS_I(new_inode)->i_sem); new_dir->i_ctime = current_time(new_dir); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8b66bc4c004b..3314a0f3405e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2349,7 +2349,6 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, if (ret) { up_read(&nm_i->nat_tree_lock); - f2fs_bug_on(sbi, !mount); f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); return ret; } @@ -2399,7 +2398,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) { - f2fs_show_injection_info(FAULT_ALLOC_NID); + f2fs_show_injection_info(sbi, FAULT_ALLOC_NID); return false; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 783773e4560d..76477f71d4ee 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -711,7 +711,7 @@ next: f2fs_put_page(page, 1); } if (!err) - f2fs_allocate_new_segments(sbi); + f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 808709581481..56e81447e2f3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -288,6 +288,8 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; struct inode *inode; struct f2fs_inode_info *fi; + unsigned int count = sbi->atomic_files; + unsigned int looped = 0; next: spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (list_empty(head)) { @@ -296,22 +298,26 @@ next: } fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); inode = igrab(&fi->vfs_inode); + if (inode) + list_move_tail(&fi->inmem_ilist, head); spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); if (inode) { if (gc_failure) { - if (fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto drop; - goto skip; + if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) + goto skip; } -drop: set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); f2fs_drop_inmem_pages(inode); +skip: iput(inode); } -skip: congestion_wait(BLK_RW_ASYNC, HZ/50); cond_resched(); + if (gc_failure) { + if (++looped >= count) + return; + } goto next; } @@ -327,13 +333,16 @@ void f2fs_drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); } - clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) list_del_init(&fi->inmem_ilist); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + sbi->atomic_files--; + } spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); } @@ -480,7 +489,7 @@ int f2fs_commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (time_to_inject(sbi, 
FAULT_CHECKPOINT)) { - f2fs_show_injection_info(FAULT_CHECKPOINT); + f2fs_show_injection_info(sbi, FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } @@ -1008,8 +1017,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, if (dc->error) printk_ratelimited( - "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d", - KERN_INFO, dc->lstart, dc->start, dc->len, dc->error); + "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", + KERN_INFO, sbi->sb->s_id, + dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -1149,7 +1159,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { - f2fs_show_injection_info(FAULT_DISCARD); + f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; goto submit; } @@ -1771,7 +1781,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return -EIO; } trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects, GFP_NOFS); } /* For conventional zones, use regular discard if supported */ @@ -2690,7 +2701,7 @@ unlock: up_read(&SM_I(sbi)->curseg_lock); } -void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg; unsigned int old_segno; @@ -2699,10 +2710,17 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + if (type != NO_CHECK_TYPE && i != type) + continue; + curseg = CURSEG_I(sbi, i); - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); - locate_dirty_segment(sbi, old_segno); + if (type == NO_CHECK_TYPE || curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, false) || + get_ckpt_valid_blocks(sbi, curseg->segno)) { + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } up_write(&SIT_I(sbi)->sentry_lock); @@ -3068,6 +3086,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + bool put_pin_sem = false; + + if (type == CURSEG_COLD_DATA) { + /* GC during CURSEG_COLD_DATA_PINNED allocation */ + if (down_read_trylock(&sbi->pin_sem)) { + put_pin_sem = true; + } else { + type = CURSEG_WARM_DATA; + curseg = CURSEG_I(sbi, type); + } + } else if (type == CURSEG_COLD_DATA_PINNED) { + type = CURSEG_COLD_DATA; + } down_read(&SM_I(sbi)->curseg_lock); @@ -3133,6 +3164,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); up_read(&SM_I(sbi)->curseg_lock); + + if (put_pin_sem) + up_read(&sbi->pin_sem); } static void update_device_state(struct f2fs_io_info *fio) @@ -3379,7 +3413,10 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); + /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); + /* submit cached IPU IO */ + f2fs_submit_merged_ipu_write(sbi, NULL, page); if (ordered) { wait_on_page_writeback(page); f2fs_bug_on(sbi, locked && PageWriteback(page)); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 325781a1ae4d..a95467b202ea 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -313,6 +313,8 @@ struct sit_entry_set { */ static inline struct 
curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { + if (type == CURSEG_COLD_DATA_PINNED) + type = CURSEG_COLD_DATA; return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1443cee15863..5111e1ffe58a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1213,9 +1213,13 @@ static int f2fs_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = (dquot->dq_dqb.dqb_bsoftlimit ? - dquot->dq_dqb.dqb_bsoftlimit : - dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + limit = 0; + if (dquot->dq_dqb.dqb_bsoftlimit) + limit = dquot->dq_dqb.dqb_bsoftlimit; + if (dquot->dq_dqb.dqb_bhardlimit && + (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) + limit = dquot->dq_dqb.dqb_bhardlimit; + if (limit && buf->f_blocks > limit) { curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; buf->f_blocks = limit; @@ -1224,9 +1228,13 @@ static int f2fs_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit ? - dquot->dq_dqb.dqb_isoftlimit : - dquot->dq_dqb.dqb_ihardlimit; + limit = 0; + if (dquot->dq_dqb.dqb_isoftlimit) + limit = dquot->dq_dqb.dqb_isoftlimit; + if (dquot->dq_dqb.dqb_ihardlimit && + (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) + limit = dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = @@ -1932,7 +1940,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; - err = dquot_enable(qf_inode, type, format_id, flags); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); iput(qf_inode); return err; } @@ -2308,13 +2316,27 @@ static bool f2fs_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); } +static bool f2fs_has_stable_inodes(struct super_block *sb) +{ + return true; +} + +static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, + int *ino_bits_ret, int *lblk_bits_ret) +{ + *ino_bits_ret = 8 * sizeof(nid_t); + *lblk_bits_ret = 8 * sizeof(block_t); +} + static const struct fscrypt_operations f2fs_cryptops = { - .key_prefix = "f2fs:", - .get_context = f2fs_get_context, - .set_context = f2fs_set_context, - .dummy_context = f2fs_dummy_context, - .empty_dir = f2fs_empty_dir, - .max_namelen = F2FS_NAME_LEN, + .key_prefix = "f2fs:", + .get_context = f2fs_get_context, + .set_context = f2fs_set_context, + .dummy_context = f2fs_dummy_context, + .empty_dir = f2fs_empty_dir, + .max_namelen = F2FS_NAME_LEN, + .has_stable_inodes = f2fs_has_stable_inodes, + .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, }; #endif @@ -2604,6 +2626,21 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } + if (RDEV(0).path[0]) { + block_t dev_seg_count = le32_to_cpu(RDEV(0).total_segments); + int i = 1; + + while (i < MAX_DEVICES && RDEV(i).path[0]) { + dev_seg_count += le32_to_cpu(RDEV(i).total_segments); + i++; + } + if (segment_count != dev_seg_count) { + f2fs_info(sbi, "Segment count (%u) mismatch with total segments from devices (%u)", + segment_count, dev_seg_count); + return -EFSCORRUPTED; + } + } + if (secs_per_zone > total_sections || !secs_per_zone) { f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)", secs_per_zone, total_sections); @@ -2838,6 +2875,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) spin_lock_init(&sbi->dev_lock); 
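The f2fs_statfs_project() hunk above changes how the effective quota limit is picked: the old code used the soft limit if set and fell back to the hard limit, while the new code takes the smaller of whichever limits are actually set. A minimal standalone sketch of that selection rule (plain C, illustrative only, not the kernel code):

/* Effective quota limit after this patch: the smaller of the soft and
 * hard limits, treating 0 as "not set". Returns 0 when neither is set. */
#include <stdio.h>

static unsigned long long effective_limit(unsigned long long soft,
					  unsigned long long hard)
{
	unsigned long long limit = 0;

	if (soft)
		limit = soft;
	if (hard && (!limit || hard < limit))
		limit = hard;
	return limit;
}

int main(void)
{
	/* soft=100, hard=50: the old code reported 100, the new code 50 */
	printf("%llu\n", effective_limit(100, 50)); /* 50 */
	printf("%llu\n", effective_limit(0, 75));   /* 75 */
	printf("%llu\n", effective_limit(80, 0));   /* 80 */
	return 0;
}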
init_rwsem(&sbi->sb_lock); + init_rwsem(&sbi->pin_sem); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2857,15 +2895,21 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED +static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct f2fs_dev_info *dev = data; + + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) + set_bit(idx, dev->blkz_seq); + return 0; +} + static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; - sector_t sector = 0; - struct blk_zone *zones; - unsigned int i, nr_zones; - unsigned int n = 0; - int err = -EIO; + int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; @@ -2890,38 +2934,13 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (!FDEV(devi).blkz_seq) return -ENOMEM; -#define F2FS_REPORT_NR_ZONES 4096 - - zones = f2fs_kzalloc(sbi, - array_size(F2FS_REPORT_NR_ZONES, - sizeof(struct blk_zone)), - GFP_KERNEL); - if (!zones) - return -ENOMEM; - /* Get block zones type */ - while (zones && sector < nr_sectors) { - - nr_zones = F2FS_REPORT_NR_ZONES; - err = blkdev_report_zones(bdev, sector, zones, &nr_zones); - if (err) - break; - if (!nr_zones) { - err = -EIO; - break; - } - - for (i = 0; i < nr_zones; i++) { - if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL) - set_bit(n, FDEV(devi).blkz_seq); - sector += zones[i].len; - n++; - } - } - - kvfree(zones); + ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb, + &FDEV(devi)); + if (ret < 0) + return ret; - return err; + return 0; } #endif @@ -2951,6 +2970,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Unable to read %dth superblock", block + 1); err = -EIO; + *recovery = 1; continue; } @@ -2960,6 +2980,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock", block + 1); brelse(bh); + *recovery = 1; continue; } @@ -2972,10 +2993,6 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, brelse(bh); } - /* Fail to read any one of the superblocks*/ - if (err < 0) - *recovery = 1; - /* No valid superblock */ if (!*raw_super) kvfree(super); @@ -3329,6 +3346,8 @@ try_onemore: sbi->write_io[i][j].bio = NULL; spin_lock_init(&sbi->write_io[i][j].io_lock); INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_rwsem(&sbi->write_io[i][j].bio_list_lock); } } @@ -3740,8 +3759,13 @@ static int __init init_f2fs_fs(void) err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_post_read; return 0; +free_post_read: + f2fs_destroy_post_read_processing(); free_root_stats: f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); @@ -3765,6 +3789,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_bio_entry_cache(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b558b64a4c9c..70945ceb9c0c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "pin_file"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -443,6 +445,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -510,6 +513,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), + ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(discard_granularity), ATTR_LIST(batched_trim_sections), diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 181900af2576..296b3189448a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -539,8 +539,9 @@ out: ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; struct f2fs_xattr_entry *entry; - void *base_addr; + void *base_addr, *last_base_addr; int error = 0; size_t rest = buffer_size; @@ -550,6 +551,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (error) return error; + last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = f2fs_xattr_handler(entry->e_name_index); @@ -557,6 +560,15 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) size_t prefix_len; size_t size; + if ((void *)(entry) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + inode->i_ino); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto cleanup; + } + if (!handler || (handler->list && !handler->list(dentry))) continue; diff --git a/fs/fat/file.c b/fs/fat/file.c index 4614c0ba5f1c..bdc4503c00a3 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -172,15 +172,6 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } -#ifdef CONFIG_COMPAT -static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) - -{ - return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && @@ -215,9 +206,7 @@ const struct file_operations fat_file_operations = { .mmap = generic_file_mmap, .release = fat_file_release, .unlocked_ioctl = fat_generic_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = fat_generic_compat_ioctl, -#endif + .compat_ioctl = compat_ptr_ioctl, .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, diff --git a/fs/fcntl.c b/fs/fcntl.c index 3d40771e8e7c..41b6438bd2d9 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) static bool rw_hint_valid(enum rw_hint hint) { switch (hint) { - case RWF_WRITE_LIFE_NOT_SET: + case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: case RWH_WRITE_LIFE_SHORT: case RWH_WRITE_LIFE_MEDIUM: diff --git a/fs/fs-writeback.c 
b/fs/fs-writeback.c index e88421d9a48d..335607b8c5c0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -576,10 +576,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, spin_unlock(&inode->i_lock); /* - * A dying wb indicates that the memcg-blkcg mapping has changed - * and a new wb is already serving the memcg. Switch immediately. + * A dying wb indicates that either the blkcg associated with the + * memcg changed or the associated memcg is dying. In the first + * case, a replacement wb should already be available and we should + * refresh the wb immediately. In the second case, trying to + * refresh will keep failing. */ - if (unlikely(wb_dying(wbc->wb))) + if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); @@ -905,7 +908,7 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr_pages: number of pages to write, 0 for best-effort dirty flushing + * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0635cba19971..eb2a585572dc 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -34,7 +34,7 @@ config VIRTIO_FS select VIRTIO help The Virtio Filesystem allows guests to mount file systems from the - host. + host. If you want to share files between guests or with the host, answer Y - or M. + or M. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 6419a2b3510d..3e8cebfb59b7 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o +obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o +virtiofs-y += virtio_fs.o diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index dadd617d826c..8e02d76fe104 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -276,10 +276,12 @@ static void flush_bg_queue(struct fuse_conn *fc) void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; - bool async = req->args->end; + bool async; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; + + async = req->args->end; /* * test_and_set_bit() implies smp_mb() between bit * changing and below intr_entry check. 
Pairs with @@ -703,7 +705,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->pipebufs++; cs->nr_segs--; } else { - if (cs->nr_segs == cs->pipe->buffers) + if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; page = alloc_page(GFP_HIGHUSER); @@ -879,7 +881,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, struct pipe_buffer *buf; int err; - if (cs->nr_segs == cs->pipe->buffers) + if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; err = unlock_request(cs->req); @@ -1341,7 +1343,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!fud) return -EPERM; - bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer), + bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; @@ -1353,7 +1355,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } @@ -1935,6 +1937,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { + unsigned int head, tail, mask, count; unsigned nbuf; unsigned idx; struct pipe_buffer *bufs; @@ -1949,8 +1952,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); - bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer), - GFP_KERNEL); + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + count = head - tail; + + bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) { pipe_unlock(pipe); return -ENOMEM; @@ -1958,8 +1965,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) - rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + for (idx = tail; idx != head && rem < len; idx++) + rem += pipe->bufs[idx & mask].len; ret = -EINVAL; if (rem < len) @@ -1970,16 +1977,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - BUG_ON(nbuf >= pipe->buffers); - BUG_ON(!pipe->nrbufs); - ibuf = &pipe->bufs[pipe->curbuf]; + BUG_ON(nbuf >= pipe->ring_size); + BUG_ON(tail == head); + ibuf = &pipe->bufs[tail & mask]; obuf = &bufs[nbuf]; if (rem >= ibuf->len) { *obuf = *ibuf; ibuf->ops = NULL; - pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); - pipe->nrbufs--; + tail++; + pipe->tail = tail; } else { if (!pipe_buf_get(pipe, ibuf)) goto out_free; @@ -2260,7 +2267,7 @@ const struct file_operations fuse_dev_operations = { .release = fuse_dev_release, .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, - .compat_ioctl = fuse_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d572c900bb0f..ee190119f45c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -248,7 +248,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) kfree(forget); if (ret == -ENOMEM) goto out; - if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + if (ret || fuse_invalid_attr(&outarg.attr) || + (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; forget_all_cached_acls(inode); @@ -319,6 +320,12 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } +bool fuse_invalid_attr(struct fuse_attr *attr) +{ + return 
!fuse_valid_type(attr->mode) || + attr->size > LLONG_MAX; +} + int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { @@ -350,7 +357,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name err = -EIO; if (!outarg->nodeid) goto out_put_forget; - if (!fuse_valid_type(outarg->attr.mode)) + if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, @@ -405,7 +412,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, else fuse_invalidate_entry_cache(entry); - fuse_advise_use_readdirplus(dir); + if (inode) + fuse_advise_use_readdirplus(dir); return newent; out_iput: @@ -474,7 +482,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_free_ff; err = -EIO; - if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) || + fuse_invalid_attr(&outentry.attr)) goto out_free_ff; ff->fh = outopen.fh; @@ -582,7 +591,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, goto out_put_forget_req; err = -EIO; - if (invalid_nodeid(outarg.nodeid)) + if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr)) goto out_put_forget_req; if ((outarg.attr.mode ^ mode) & S_IFMT) @@ -861,7 +870,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); - inc_nlink(inode); + if (likely(inode->i_nlink < UINT_MAX)) + inc_nlink(inode); spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_update_ctime(inode); @@ -941,7 +951,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { - if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + if (fuse_invalid_attr(&outarg.attr) || + (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); err = -EIO; } else { @@ -1521,6 +1532,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, is_truncate = true; } + /* Flush dirty data/metadata before non-truncate SETATTR */ + if (is_wb && S_ISREG(inode->i_mode) && + attr->ia_valid & + (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | + ATTR_TIMES_SET)) { + err = write_inode_now(inode, true); + if (err) + return err; + + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); + } + if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -1549,7 +1573,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, goto error; } - if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + if (fuse_invalid_attr(&outarg.attr) || + (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); err = -EIO; goto error; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0f0225686aee..a63d779eac10 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -217,7 +217,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; - bool lock_inode = (file->f_flags & O_TRUNC) && + bool is_wb_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && fc->writeback_cache; @@ -225,16 +225,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; - if (lock_inode) + if (is_wb_truncate) { inode_lock(inode); + fuse_set_nowrite(inode); + } err = fuse_do_open(fc, 
get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); - if (lock_inode) + if (is_wb_truncate) { + fuse_release_nowrite(inode); inode_unlock(inode); + } return err; } @@ -709,8 +713,10 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, ia->ap.args.end = fuse_aio_complete_req; err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); + if (err) + fuse_aio_complete_req(fc, &ia->ap.args, err); - return err ?: num_bytes; + return num_bytes; } static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, @@ -1092,6 +1098,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, ia->write.in.flags = fuse_write_flags(iocb); err = fuse_simple_request(fc, &ap->args); + if (!err && ia->write.out.size > count) + err = -EIO; offset = ap->descs[0].offset; count = ia->write.out.size; @@ -1997,7 +2005,7 @@ static int fuse_writepages_fill(struct page *page, if (!data->ff) { err = -EIO; - data->ff = fuse_write_file_get(fc, get_fuse_inode(inode)); + data->ff = fuse_write_file_get(fc, fi); if (!data->ff) goto out_unlock; } @@ -2042,8 +2050,6 @@ static int fuse_writepages_fill(struct page *page, * under writeback, so we can release the page lock. */ if (data->wpa == NULL) { - struct fuse_inode *fi = get_fuse_inode(inode); - err = -ENOMEM; wpa = fuse_writepage_args_alloc(); if (!wpa) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 956aeaf961ae..aa75e2305b75 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -479,6 +479,7 @@ struct fuse_fs_context { bool destroy:1; bool no_control:1; bool no_force_umount:1; + bool no_mount_options:1; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -713,6 +714,9 @@ struct fuse_conn { /** Do not allow MNT_FORCE umount */ unsigned int no_force_umount:1; + /* Do not show mount options */ + unsigned int no_mount_options:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -985,6 +989,8 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); */ int fuse_valid_type(int m); +bool fuse_invalid_attr(struct fuse_attr *attr); + /** * Is current process allowed to perform filesystem operation? 
*/ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e040e2a2b621..16aec32f7f3d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -558,6 +558,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); + if (fc->no_mount_options) + return 0; + seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); if (fc->default_permissions) @@ -1180,6 +1183,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; + fc->no_mount_options = ctx->no_mount_options; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 5c38b9d84c6e..6a40f75a0d25 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -184,7 +184,7 @@ static int fuse_direntplus_link(struct file *file, if (invalid_nodeid(o->nodeid)) return -EIO; - if (!fuse_valid_type(o->attr.mode)) + if (fuse_invalid_attr(&o->attr)) return -EIO; fc = get_fuse_conn(dir); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6af3f131e468..bade74768903 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -30,10 +30,12 @@ struct virtio_fs_vq { struct virtqueue *vq; /* protected by ->lock */ struct work_struct done_work; struct list_head queued_reqs; + struct list_head end_reqs; /* End these requests */ struct delayed_work dispatch_work; struct fuse_dev *fud; bool connected; long in_flight; + struct completion in_flight_zero; /* No inflight requests */ char name[24]; } ____cacheline_aligned_in_smp; @@ -47,13 +49,20 @@ struct virtio_fs { unsigned int num_request_queues; /* number of request queues */ }; -struct virtio_fs_forget { +struct virtio_fs_forget_req { struct fuse_in_header ih; struct fuse_forget_in arg; +}; + +struct virtio_fs_forget { /* This request can be temporarily queued on virt queue */ struct list_head list; + struct virtio_fs_forget_req req; }; +static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, + struct fuse_req *req, bool in_flight); + static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) { struct virtio_fs *fs = vq->vdev->priv; @@ -66,6 +75,21 @@ static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) return &vq_to_fsvq(vq)->fud->pq; } +/* Should be called with fsvq->lock held. */ +static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq) +{ + fsvq->in_flight++; +} + +/* Should be called with fsvq->lock held. */ +static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) +{ + WARN_ON(fsvq->in_flight <= 0); + fsvq->in_flight--; + if (!fsvq->in_flight) + complete(&fsvq->in_flight_zero); +} + static void release_virtio_fs_obj(struct kref *ref) { struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); @@ -94,51 +118,46 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) WARN_ON(fsvq->in_flight < 0); /* Wait for in flight requests to finish.*/ - while (1) { - spin_lock(&fsvq->lock); - if (!fsvq->in_flight) { - spin_unlock(&fsvq->lock); - break; - } + spin_lock(&fsvq->lock); + if (fsvq->in_flight) { + /* We are holding virtio_fs_mutex. There should not be any + * waiters waiting for completion. 
+ */ + reinit_completion(&fsvq->in_flight_zero); + spin_unlock(&fsvq->lock); + wait_for_completion(&fsvq->in_flight_zero); + } else { spin_unlock(&fsvq->lock); - /* TODO use completion instead of timeout */ - usleep_range(1000, 2000); } flush_work(&fsvq->done_work); flush_delayed_work(&fsvq->dispatch_work); } -static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq) -{ - struct virtio_fs_forget *forget; - - spin_lock(&fsvq->lock); - while (1) { - forget = list_first_entry_or_null(&fsvq->queued_reqs, - struct virtio_fs_forget, list); - if (!forget) - break; - list_del(&forget->list); - kfree(forget); - } - spin_unlock(&fsvq->lock); -} - -static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; int i; for (i = 0; i < fs->nvqs; i++) { fsvq = &fs->vqs[i]; - if (i == VQ_HIPRIO) - drain_hiprio_queued_reqs(fsvq); - virtio_fs_drain_queue(fsvq); } } +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +{ + /* Provides mutual exclusion between ->remove and ->kill_sb + * paths. We don't want both of these draining queue at the + * same time. Current completion logic reinits completion + * and that means there should not be any other thread + * doing reinit or waiting for completion already. + */ + mutex_lock(&virtio_fs_mutex); + virtio_fs_drain_all_queues_locked(fs); + mutex_unlock(&virtio_fs_mutex); +} + static void virtio_fs_start_all_queues(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; @@ -253,74 +272,148 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work) while ((req = virtqueue_get_buf(vq, &len)) != NULL) { kfree(req); - fsvq->in_flight--; + dec_in_flight_req(fsvq); } } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); spin_unlock(&fsvq->lock); } -static void virtio_fs_dummy_dispatch_work(struct work_struct *work) +static void virtio_fs_request_dispatch_work(struct work_struct *work) { -} - -static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) -{ - struct virtio_fs_forget *forget; + struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work.work); - struct virtqueue *vq = fsvq->vq; - struct scatterlist sg; - struct scatterlist *sgs[] = {&sg}; - bool notify; + struct fuse_conn *fc = fsvq->fud->fc; int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); while (1) { spin_lock(&fsvq->lock); - forget = list_first_entry_or_null(&fsvq->queued_reqs, - struct virtio_fs_forget, list); - if (!forget) { + req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req, + list); + if (!req) { spin_unlock(&fsvq->lock); - return; + break; } - list_del(&forget->list); - if (!fsvq->connected) { + list_del_init(&req->list); + spin_unlock(&fsvq->lock); + fuse_request_end(fc, req); + } + + /* Dispatch pending requests */ + while (1) { + spin_lock(&fsvq->lock); + req = list_first_entry_or_null(&fsvq->queued_reqs, + struct fuse_req, list); + if (!req) { spin_unlock(&fsvq->lock); - kfree(forget); - continue; + return; } + list_del_init(&req->list); + spin_unlock(&fsvq->lock); - sg_init_one(&sg, forget, sizeof(*forget)); - - /* Enqueue the request */ - dev_dbg(&vq->vdev->dev, "%s\n", __func__); - ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); + ret = virtio_fs_enqueue_req(fsvq, req, true); if (ret < 0) { if (ret == -ENOMEM || ret == -ENOSPC) { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. 
Will try later\n", - ret); - list_add_tail(&forget->list, - &fsvq->queued_reqs); + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->queued_reqs); schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); - } else { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", - ret); - kfree(forget); + msecs_to_jiffies(1)); + spin_unlock(&fsvq->lock); + return; } + req->out.h.error = ret; + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", + ret); + fuse_request_end(fc, req); + } + } +} + +/* + * Returns 1 if queue is full and sender should wait a bit before sending + * next request, 0 otherwise. + */ +static int send_forget_request(struct virtio_fs_vq *fsvq, + struct virtio_fs_forget *forget, + bool in_flight) +{ + struct scatterlist sg; + struct virtqueue *vq; + int ret = 0; + bool notify; + struct virtio_fs_forget_req *req = &forget->req; + + spin_lock(&fsvq->lock); + if (!fsvq->connected) { + if (in_flight) + dec_in_flight_req(fsvq); + kfree(forget); + goto out; + } + + sg_init_one(&sg, req, sizeof(*req)); + vq = fsvq->vq; + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + + ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", + ret); + list_add_tail(&forget->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + if (!in_flight) + inc_in_flight_req(fsvq); + /* Queue is full */ + ret = 1; + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + if (in_flight) + dec_in_flight_req(fsvq); + } + goto out; + } + + if (!in_flight) + inc_in_flight_req(fsvq); + notify = virtqueue_kick_prepare(vq); + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + return ret; +out: + spin_unlock(&fsvq->lock); + return ret; +} + +static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) +{ + struct virtio_fs_forget *forget; + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + dispatch_work.work); + pr_debug("virtio-fs: worker %s called.\n", __func__); + while (1) { + spin_lock(&fsvq->lock); + forget = list_first_entry_or_null(&fsvq->queued_reqs, + struct virtio_fs_forget, list); + if (!forget) { spin_unlock(&fsvq->lock); return; } - fsvq->in_flight++; - notify = virtqueue_kick_prepare(vq); + list_del(&forget->list); spin_unlock(&fsvq->lock); - - if (notify) - virtqueue_notify(vq); - pr_debug("virtio-fs: worker %s dispatched one forget request.\n", - __func__); + if (send_forget_request(fsvq, forget, true)) + return; } } @@ -452,7 +545,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) fuse_request_end(fc, req); spin_lock(&fsvq->lock); - fsvq->in_flight--; + dec_in_flight_req(fsvq); spin_unlock(&fsvq->lock); } } @@ -502,8 +595,10 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); + INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, virtio_fs_hiprio_dispatch_work); + init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero); spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); /* Initialize the requests virtqueues */ @@ -511,8 +606,10 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, 
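The virtio-fs hunks above replace the old poll-and-sleep drain loop (usleep_range() behind a TODO) with a counter plus a completion: inc_in_flight_req()/dec_in_flight_req() track outstanding requests under fsvq->lock, and the final decrement fires in_flight_zero so virtio_fs_drain_queue() can sleep until the queue is empty. A rough user-space sketch of the same pattern, with pthreads standing in for the kernel's spinlock and completion (names invented for illustration):

/* Counter-plus-completion drain pattern, sketched with pthreads.
 * Initialize with: struct flight f = { PTHREAD_MUTEX_INITIALIZER,
 *                                      PTHREAD_COND_INITIALIZER, 0 }; */
#include <pthread.h>

struct flight {
	pthread_mutex_t lock;	/* stands in for fsvq->lock */
	pthread_cond_t zero;	/* stands in for in_flight_zero */
	long in_flight;
};

static void inc_in_flight(struct flight *f)
{
	pthread_mutex_lock(&f->lock);
	f->in_flight++;
	pthread_mutex_unlock(&f->lock);
}

static void dec_in_flight(struct flight *f)
{
	pthread_mutex_lock(&f->lock);
	if (--f->in_flight == 0)
		pthread_cond_broadcast(&f->zero);	/* complete() */
	pthread_mutex_unlock(&f->lock);
}

static void drain(struct flight *f)
{
	pthread_mutex_lock(&f->lock);
	while (f->in_flight)				/* wait_for_completion() */
		pthread_cond_wait(&f->zero, &f->lock);
	pthread_mutex_unlock(&f->lock);
}

Unlike the kernel version, the condition-variable form needs no reinit_completion(): the predicate is re-checked under the lock on every wakeup, which is why the while-loop is the standard idiom here.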
spin_lock_init(&fs->vqs[i].lock); INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, - virtio_fs_dummy_dispatch_work); + virtio_fs_request_dispatch_work); INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); + INIT_LIST_HEAD(&fs->vqs[i].end_reqs); + init_completion(&fs->vqs[i].in_flight_zero); snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), "requests.%u", i - VQ_REQUEST); callbacks[i] = virtio_fs_vq_done; @@ -606,7 +703,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) /* This device is going away. No one should get new reference */ list_del_init(&fs->list); virtio_fs_stop_all_queues(fs); - virtio_fs_drain_all_queues(fs); + virtio_fs_drain_all_queues_locked(fs); vdev->config->reset(vdev); virtio_fs_cleanup_vqs(vdev, fs); @@ -631,12 +728,12 @@ static int virtio_fs_restore(struct virtio_device *vdev) } #endif /* CONFIG_PM_SLEEP */ -const static struct virtio_device_id id_table[] = { +static const struct virtio_device_id id_table[] = { { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, {}, }; -const static unsigned int feature_table[] = {}; +static const unsigned int feature_table[] = {}; static struct virtio_driver virtio_fs_driver = { .driver.name = KBUILD_MODNAME, @@ -657,14 +754,10 @@ __releases(fiq->lock) { struct fuse_forget_link *link; struct virtio_fs_forget *forget; - struct scatterlist sg; - struct scatterlist *sgs[] = {&sg}; + struct virtio_fs_forget_req *req; struct virtio_fs *fs; - struct virtqueue *vq; struct virtio_fs_vq *fsvq; - bool notify; u64 unique; - int ret; link = fuse_dequeue_forget(fiq, 1, NULL); unique = fuse_get_unique(fiq); @@ -675,56 +768,19 @@ __releases(fiq->lock) /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); + req = &forget->req; - forget->ih = (struct fuse_in_header){ + req->ih = (struct fuse_in_header){ .opcode = FUSE_FORGET, .nodeid = link->forget_one.nodeid, .unique = unique, - .len = sizeof(*forget), + .len = sizeof(*req), }; - forget->arg = (struct fuse_forget_in){ + req->arg = (struct fuse_forget_in){ .nlookup = link->forget_one.nlookup, }; - sg_init_one(&sg, forget, sizeof(*forget)); - - /* Enqueue the request */ - spin_lock(&fsvq->lock); - - if (!fsvq->connected) { - kfree(forget); - spin_unlock(&fsvq->lock); - goto out; - } - - vq = fsvq->vq; - dev_dbg(&vq->vdev->dev, "%s\n", __func__); - - ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n", - ret); - list_add_tail(&forget->list, &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); - } else { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. 
Dropping it.\n", - ret); - kfree(forget); - } - spin_unlock(&fsvq->lock); - goto out; - } - - fsvq->in_flight++; - notify = virtqueue_kick_prepare(vq); - - spin_unlock(&fsvq->lock); - - if (notify) - virtqueue_notify(vq); -out: + send_forget_request(fsvq, forget, false); kfree(link); } @@ -819,7 +875,7 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, /* Add a request to a virtqueue and kick the device */ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req) + struct fuse_req *req, bool in_flight) { /* requests need at least 4 elements */ struct scatterlist *stack_sgs[6]; @@ -835,6 +891,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, unsigned int i; int ret; bool notify; + struct fuse_pqueue *fpq; /* Does the sglist fit on the stack? */ total_sgs = sg_count_fuse_req(req); @@ -889,7 +946,17 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, goto out; } - fsvq->in_flight++; + /* Request successfully sent. */ + fpq = &fsvq->fud->pq; + spin_lock(&fpq->lock); + list_add_tail(&req->list, fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + + if (!in_flight) + inc_in_flight_req(fsvq); notify = virtqueue_kick_prepare(vq); spin_unlock(&fsvq->lock); @@ -915,9 +982,8 @@ __releases(fiq->lock) { unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ struct virtio_fs *fs; - struct fuse_conn *fc; struct fuse_req *req; - struct fuse_pqueue *fpq; + struct virtio_fs_vq *fsvq; int ret; WARN_ON(list_empty(&fiq->pending)); @@ -928,49 +994,41 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); fs = fiq->priv; - fc = fs->vqs[queue_id].fud->fc; pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid, req->in.h.len, fuse_len_args(req->args->out_numargs, req->args->out_args)); - fpq = &fs->vqs[queue_id].fud->pq; - spin_lock(&fpq->lock); - if (!fpq->connected) { - spin_unlock(&fpq->lock); - req->out.h.error = -ENODEV; - pr_err("virtio-fs: %s disconnected\n", __func__); - fuse_request_end(fc, req); - return; - } - list_add_tail(&req->list, fpq->processing); - spin_unlock(&fpq->lock); - set_bit(FR_SENT, &req->flags); - /* matches barrier in request_wait_answer() */ - smp_mb__after_atomic(); - -retry: - ret = virtio_fs_enqueue_req(&fs->vqs[queue_id], req); + fsvq = &fs->vqs[queue_id]; + ret = virtio_fs_enqueue_req(fsvq, req, false); if (ret < 0) { if (ret == -ENOMEM || ret == -ENOSPC) { - /* Virtqueue full. Retry submission */ - /* TODO use completion instead of timeout */ - usleep_range(20, 30); - goto retry; + /* + * Virtqueue full. Retry submission from worker + * context as we might be holding fc->bg_lock. + */ + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->queued_reqs); + inc_in_flight_req(fsvq); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + spin_unlock(&fsvq->lock); + return; } req->out.h.error = ret; pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); - spin_lock(&fpq->lock); - clear_bit(FR_SENT, &req->flags); - list_del_init(&req->list); - spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + + /* Can't end request in submission context. 
Use a worker */ + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->end_reqs); + schedule_delayed_work(&fsvq->dispatch_work, 0); + spin_unlock(&fsvq->lock); return; } } -const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { +static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, @@ -992,6 +1050,7 @@ static int virtio_fs_fill_super(struct super_block *sb) .destroy = true, .no_control = true, .no_force_umount = true, + .no_mount_options = true, }; mutex_lock(&virtio_fs_mutex); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index b9fe975d7625..9c6df721321a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -133,7 +133,7 @@ static int gfs2_write_full_page(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - offset = i_size & (PAGE_SIZE-1); + offset = i_size & (PAGE_SIZE - 1); if (page->index == end_index && offset) zero_user_segment(page, offset, PAGE_SIZE); @@ -497,7 +497,7 @@ static int __gfs2_readpage(void *file, struct page *page) error = mpage_readpage(page, gfs2_block_map); } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; return error; @@ -614,7 +614,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) ret = -EIO; return ret; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 516103248272..08f6fbb3655e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2441,8 +2441,16 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int blocksize = i_blocksize(inode); + loff_t start, end; int error; + start = round_down(offset, blocksize); + end = round_up(offset + length, blocksize) - 1; + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + if (gfs2_is_jdata(ip)) error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, GFS2_JTRUNC_REVOKES); @@ -2456,9 +2464,8 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) if (error) goto out; } else { - unsigned int start_off, end_len, blocksize; + unsigned int start_off, end_len; - blocksize = i_blocksize(inode); start_off = offset & (blocksize - 1); end_len = (offset + length) & (blocksize - 1); if (start_off) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f0caee2b7c00..9d58295ccf7a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/compat.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/pagemap.h> @@ -354,6 +355,31 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -ENOTTY; } +#ifdef CONFIG_COMPAT +static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + /* These are just misnamed, they actually get/put from/to user an int */ + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + /* Keep this list in sync with 
gfs2_ioctl */ + case FITRIM: + case FS_IOC_GETFSLABEL: + break; + default: + return -ENOIOCTLCMD; + } + + return gfs2_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#else +#define gfs2_compat_ioctl NULL +#endif + /** * gfs2_size_hint - Give a hint to the size of a write request * @filep: The struct file @@ -381,27 +407,28 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) /** * gfs2_allocate_page_backing - Allocate blocks for a write fault * @page: The (locked) page to allocate backing for + * @length: Size of the allocation * * We try to allocate all the blocks required for the page in one go. This * might fail for various reasons, so we keep trying until all the blocks to * back this page are allocated. If some of the blocks are already allocated, * that is ok too. */ -static int gfs2_allocate_page_backing(struct page *page) +static int gfs2_allocate_page_backing(struct page *page, unsigned int length) { u64 pos = page_offset(page); - u64 size = PAGE_SIZE; do { struct iomap iomap = { }; - if (gfs2_iomap_get_alloc(page->mapping->host, pos, 1, &iomap)) + if (gfs2_iomap_get_alloc(page->mapping->host, pos, length, &iomap)) return -EIO; - iomap.length = min(iomap.length, size); - size -= iomap.length; + if (length < iomap.length) + iomap.length = length; + length -= iomap.length; pos += iomap.length; - } while (size > 0); + } while (length > 0); return 0; } @@ -422,10 +449,10 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; - unsigned long last_index; - u64 pos = page_offset(page); + u64 offset = page_offset(page); unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; + unsigned int length; loff_t size; int ret; @@ -435,20 +462,39 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (ret) goto out; - gfs2_size_hint(vmf->vma->vm_file, pos, PAGE_SIZE); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; + /* Check page index against inode size */ + size = i_size_read(inode); + if (offset >= size) { + ret = -EINVAL; + goto out_unlock; + } + /* Update file times before taking page lock */ file_update_time(vmf->vma->vm_file); + /* page is wholly or partially inside EOF */ + if (offset > size - PAGE_SIZE) + length = offset_in_page(size); + else + length = PAGE_SIZE; + + gfs2_size_hint(vmf->vma->vm_file, offset, length); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_SIZE)) { + /* + * iomap_writepage / iomap_writepages currently don't support inline + * files, so always unstuff here. 
+ */ + + if (!gfs2_is_stuffed(ip) && + !gfs2_write_alloc_required(ip, offset, length)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { ret = -EAGAIN; @@ -461,7 +507,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (ret) goto out_unlock; - gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; ret = gfs2_quota_lock_check(ip, &ap); if (ret) @@ -482,13 +528,6 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) goto out_trans_fail; lock_page(page); - ret = -EINVAL; - size = i_size_read(inode); - last_index = (size - 1) >> PAGE_SHIFT; - /* Check page index against inode size */ - if (size == 0 || (page->index > last_index)) - goto out_trans_end; - ret = -EAGAIN; /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. @@ -501,7 +540,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); if (ret == 0) - ret = gfs2_allocate_page_backing(page); + ret = gfs2_allocate_page_backing(page, length); out_trans_end: if (ret) @@ -935,6 +974,7 @@ out: brelse(dibh); return error; } + /** * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of * blocks, determine how many bytes can be written. @@ -1182,7 +1222,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) { + if (unlikely(gfs2_withdrawn(sdp))) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; @@ -1295,6 +1335,7 @@ const struct file_operations gfs2_file_fops = { .write_iter = gfs2_file_write_iter, .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, @@ -1310,6 +1351,7 @@ const struct file_operations gfs2_file_fops = { const struct file_operations gfs2_dir_fops = { .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, @@ -1326,6 +1368,7 @@ const struct file_operations gfs2_file_fops_nolock = { .write_iter = gfs2_file_write_iter, .iopoll = iomap_dio_iopoll, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, @@ -1339,6 +1382,7 @@ const struct file_operations gfs2_file_fops_nolock = { const struct file_operations gfs2_dir_fops_nolock = { .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 0290a22ebccf..b7123de7c180 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -549,7 +549,7 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? 
gh->gh_flags : 0); int ret; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) && + if (unlikely(gfs2_withdrawn(sdp)) && target != LM_ST_UNLOCKED) return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | @@ -558,7 +558,14 @@ __acquires(&gl->gl_lockref.lock) GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { - set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + /* + * If another process is already doing the invalidate, let that + * finish first. The glock state machine will get back to this + * holder again later. + */ + if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, + &gl->gl_flags)) + return; do_error(gl, 0); /* Fail queued try locks */ } gl->gl_req = target; @@ -586,8 +593,7 @@ __acquires(&gl->gl_lockref.lock) } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, !test_bit(SDF_WITHDRAWN, - &sdp->sd_flags)); + GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -1191,7 +1197,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error = 0; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; if (test_bit(GLF_LRU, &gl->gl_flags)) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index ff213690e364..4ede1f18de85 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -350,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), be32_to_cpu(str->di_minor)); break; - }; + } i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); @@ -540,7 +540,7 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) gfs2_consist(sdp); /* Initialize some head of the log stuff */ - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { + if (!gfs2_withdrawn(sdp)) { sdp->sd_log_sequence = head.lh_sequence + 1; gfs2_log_pointers_init(sdp, head.lh_blkno); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1e18fb587eb..dafef10b91f1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -656,7 +656,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode->i_rdev = dev; inode->i_size = size; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); ip->i_goal = dip->i_goal; @@ -712,7 +711,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_gunlock2; + goto fail_free_inode; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -723,7 +722,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) - goto fail_gunlock2; + goto fail_free_inode; BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags)); @@ -732,7 +731,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock2; glock_set_object(ip->i_iopen_gh.gh_gl, ip); - gfs2_glock_put(io_gl); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -765,6 +763,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); d_instantiate(dentry, inode); + /* After instantiate, errors should result in evict which will destroy + * both inode and iopen glocks properly. 
*/ if (file) { file->f_mode |= FMODE_CREATED; error = finish_open(file, dentry, gfs2_open_common); @@ -772,15 +772,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_glock_dq_uninit(ghs); gfs2_glock_dq_uninit(ghs + 1); clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + gfs2_glock_put(io_gl); return error; fail_gunlock3: glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); - gfs2_glock_put(io_gl); fail_gunlock2: - if (io_gl) - clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + gfs2_glock_put(io_gl); fail_free_inode: if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); @@ -1475,7 +1475,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = -EEXIST; default: goto out_gunlock; - }; + } if (odip != ndip) { if (!ndip->i_inode.i_nlink) { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 58e237fba565..eb3f2e7b8085 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -31,6 +31,8 @@ #include "dir.h" #include "trace_gfs2.h" +static void gfs2_log_shutdown(struct gfs2_sbd *sdp); + /** * gfs2_struct2blk - compute stuff * @sdp: the filesystem @@ -159,7 +161,8 @@ restart: list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw)) + if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw) && + !gfs2_withdrawn(sdp)) goto restart; } spin_unlock(&sdp->sd_ail_lock); @@ -609,6 +612,14 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) list_add(&bd->bd_list, &sdp->sd_log_revokes); } +void gfs2_glock_remove_revoke(struct gfs2_glock *gl) +{ + if (atomic_dec_return(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); + } +} + void gfs2_write_revokes(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; @@ -682,12 +693,16 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, { struct gfs2_log_header *lh; u32 hash, crc; - struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + struct page *page; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct timespec64 tv; struct super_block *sb = sdp->sd_vfs; u64 dblock; + if (gfs2_withdrawn(sdp)) + goto out; + + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); lh = page_address(page); clear_page(lh); @@ -707,7 +722,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_nsec = cpu_to_be32(tv.tv_nsec); lh->lh_sec = cpu_to_be64(tv.tv_sec); if (!list_empty(&jd->extent_list)) - dblock = gfs2_log_bmap(sdp); + dblock = gfs2_log_bmap(jd, lblock); else { int ret = gfs2_lblk_to_dblk(jd->jd_inode, lblock, &dblock); if (gfs2_assert_withdraw(sdp, ret == 0)) @@ -740,6 +755,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); +out: log_flush_wait(sdp); } @@ -768,6 +784,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail, sdp->sd_log_flush_head, flags, op_flags); + gfs2_log_incr_head(sdp); if (sdp->sd_log_tail != tail) log_pull_tail(sdp, tail); @@ -948,7 +965,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) * */ -void gfs2_log_shutdown(struct gfs2_sbd *sdp) +static void gfs2_log_shutdown(struct gfs2_sbd *sdp) { gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 2315fca47a2b..2ff163a8dce1 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -74,9 +74,9 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); -extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); extern void gfs2_write_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 5b17979af539..55fed7daf2b1 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -129,7 +129,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, atomic_dec(&sdp->sd_log_pinned); } -static void gfs2_log_incr_head(struct gfs2_sbd *sdp) +void gfs2_log_incr_head(struct gfs2_sbd *sdp) { BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && (sdp->sd_log_flush_head != sdp->sd_log_head)); @@ -138,18 +138,13 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp) sdp->sd_log_flush_head = 0; } -u64 gfs2_log_bmap(struct gfs2_sbd *sdp) +u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) { - unsigned int lbn = sdp->sd_log_flush_head; struct gfs2_journal_extent *je; - u64 block; - list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) { - if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) { - block = je->dblock + lbn - je->lblock; - gfs2_log_incr_head(sdp); - return block; - } + list_for_each_entry(je, &jd->extent_list, list) { + if (lblock >= je->lblock && lblock < je->lblock + je->blocks) + return je->dblock + lblock - je->lblock; } return -1; @@ -351,8 +346,11 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) { - gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), - gfs2_log_bmap(sdp)); + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), dblock); } /** @@ -369,8 +367,11 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) { struct super_block *sb = sdp->sd_vfs; - gfs2_log_write(sdp, page, sb->s_blocksize, 0, - gfs2_log_bmap(sdp)); + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); } /** @@ -882,10 +883,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gl = bd->bd_gl; - if (atomic_dec_return(&gl->gl_revokes) == 0) { - clear_bit(GLF_LFLUSH, &gl->gl_flags); - gfs2_glock_queue_put(gl); - } + gfs2_glock_remove_revoke(gl); kmem_cache_free(gfs2_bufdata_cachep, bd); } } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c059957a733..9c5e4e491e03 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -18,7 +18,8 @@ ~(2 * sizeof(__be64) - 1)) extern const struct gfs2_log_operations *gfs2_log_ops[]; -extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); +extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); 
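With head advancement split out into gfs2_log_incr_head(), gfs2_log_bmap() is now a pure translation over the journal's extent list: find the extent containing the logical block, then add the offset into it. A standalone sketch of that arithmetic with hypothetical extent values (simplified types, not the gfs2 structures):

#include <assert.h>
#include <stdint.h>

struct journal_extent {
	uint64_t lblock;	/* first logical block of the extent */
	uint64_t dblock;	/* corresponding physical (disk) block */
	uint64_t blocks;	/* extent length */
};

/* Same lookup as the reworked gfs2_log_bmap() above. */
static uint64_t log_bmap(const struct journal_extent *ext, int n,
			 uint64_t lblock)
{
	for (int i = 0; i < n; i++) {
		if (lblock >= ext[i].lblock &&
		    lblock < ext[i].lblock + ext[i].blocks)
			return ext[i].dblock + lblock - ext[i].lblock;
	}
	return (uint64_t)-1;	/* not mapped */
}

int main(void)
{
	const struct journal_extent ext[] = {
		{ .lblock = 0, .dblock = 1000, .blocks = 8 },
		{ .lblock = 8, .dblock = 5000, .blocks = 16 },
	};

	assert(log_bmap(ext, 2, 3)  == 1003);	/* inside the first extent */
	assert(log_bmap(ext, 2, 12) == 5004);	/* inside the second extent */
	return 0;
}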
extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 662ef36c1874..0c3772974030 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -251,7 +251,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) { + if (unlikely(gfs2_withdrawn(sdp))) { *bhp = NULL; return -EIO; } @@ -309,7 +309,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; wait_on_buffer(bh); @@ -320,7 +320,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 681b44682b0d..e8b7b0ce8404 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1006,8 +1006,7 @@ hostdata_error: void gfs2_lm_unmount(struct gfs2_sbd *sdp) { const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; - if (likely(!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) && - lm->lm_unmount) + if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount) lm->lm_unmount(sdp); } @@ -1328,7 +1327,7 @@ static const struct fs_parameter_enum gfs2_param_enums[] = { {} }; -const struct fs_parameter_description gfs2_fs_parameters = { +static const struct fs_parameter_description gfs2_fs_parameters = { .name = "gfs2", .specs = gfs2_param_specs, .enums = gfs2_param_enums, @@ -1540,17 +1539,23 @@ static int gfs2_init_fs_context(struct fs_context *fc) { struct gfs2_args *args; - args = kzalloc(sizeof(*args), GFP_KERNEL); + args = kmalloc(sizeof(*args), GFP_KERNEL); if (args == NULL) return -ENOMEM; - args->ar_quota = GFS2_QUOTA_DEFAULT; - args->ar_data = GFS2_DATA_DEFAULT; - args->ar_commit = 30; - args->ar_statfs_quantum = 30; - args->ar_quota_quantum = 60; - args->ar_errors = GFS2_ERRORS_DEFAULT; + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct gfs2_sbd *sdp = fc->root->d_sb->s_fs_info; + *args = sdp->sd_args; + } else { + memset(args, 0, sizeof(*args)); + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + args->ar_commit = 30; + args->ar_statfs_quantum = 30; + args->ar_quota_quantum = 60; + args->ar_errors = GFS2_ERRORS_DEFAULT; + } fc->fs_private = args; fc->ops = &gfs2_context_ops; return 0; @@ -1600,6 +1605,7 @@ static int gfs2_meta_get_tree(struct fs_context *fc) } static const struct fs_context_operations gfs2_meta_context_ops = { + .free = gfs2_fc_free, .get_tree = gfs2_meta_get_tree, }; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7c016a082aa6..e9f93045eb01 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1273,7 +1273,7 @@ int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; - unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder); + unsigned int max_qd = PAGE_SIZE / sizeof(struct gfs2_holder); unsigned int num_qd; unsigned int x; int error = 0; @@ -1475,7 +1475,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_WITHDRAWN, 
&sdp->sd_flags)) { + if (!gfs2_withdrawn(sdp)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); sdp->sd_log_error = error; wake_up(&sdp->sd_logd_waitq); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index c529f8749a89..85f830e56945 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -263,11 +263,13 @@ static void clean_journal(struct gfs2_jdesc *jd, u32 lblock = head->lh_blkno; gfs2_replay_incr_blk(jd, &lblock); - if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) - sdp->sd_log_flush_head = lblock; gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock, GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY, REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC); + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + sdp->sd_log_flush_head = lblock; + gfs2_log_incr_head(sdp); + } } @@ -326,7 +328,7 @@ void gfs2_recover_func(struct work_struct *work) default: goto fail; - }; + } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5fa1eec4fb4f..68cc7c291a81 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -399,8 +399,7 @@ struct lfcc { * Returns: errno */ -static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, - struct gfs2_holder *freeze_gh) +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) { struct gfs2_inode *ip; struct gfs2_jdesc *jd; @@ -425,7 +424,9 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, freeze_gh); + GL_NOCACHE, &sdp->sd_freeze_gh); + if (error) + goto out; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); @@ -441,7 +442,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } if (error) - gfs2_glock_dq_uninit(freeze_gh); + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); out: while (!list_empty(&list)) { @@ -553,7 +554,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) if (!(flags & I_DIRTY_INODE)) return; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); @@ -602,7 +603,7 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, &freeze_gh); - if (error && !test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) + if (error && !gfs2_withdrawn(sdp)) return error; flush_workqueue(gfs2_delete_workqueue); @@ -761,21 +762,25 @@ static int gfs2_freeze(struct super_block *sb) if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) goto out; - if (test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { - error = -EINVAL; - goto out; - } - for (;;) { - error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (gfs2_withdrawn(sdp)) { + error = -EINVAL; + goto out; + } + + error = gfs2_lock_fs_check_clean(sdp); if (!error) break; if (error == -EBUSY) fs_err(sdp, "waiting for recovery before freeze\n"); - else + else if (error == -EIO) { + fs_err(sdp, "Fatal IO error: cannot freeze gfs2 due " + "to recovery error.\n"); + goto out; + } else { fs_err(sdp, "error freezing FS: %d\n", error); - + } fs_err(sdp, "retrying...\n"); msleep(1000); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index dd15b8e4af2c..8ccb68f4ed16 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -118,7 +118,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int b = 
test_bit(SDF_WITHDRAWN, &sdp->sd_flags); + unsigned int b = gfs2_withdrawn(sdp); return snprintf(buf, PAGE_SIZE, "%u\n", b); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 35e3059255fe..9d4227330de4 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -262,6 +262,8 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) list_del_init(&bd->bd_list); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); sdp->sd_log_num_revoke--; + if (bd->bd_gl) + gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); tr->tr_num_revoke--; if (--n == 0) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index c45159133d8e..ec600b487498 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -258,7 +258,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) + if (!gfs2_withdrawn(sdp)) fs_err(sdp, "fatal: I/O error\n" " block = %llu\n" diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 4b68b2c1fe56..f2702bc9837c 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -164,6 +164,15 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, return x; } +/** + * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn + * @sdp: the superblock + */ +static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAWN, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index d85230c84ef2..f32f15669996 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -325,4 +325,5 @@ const struct file_operations hpfs_dir_ops = .release = hpfs_dir_release, .fsync = hpfs_file_fsync, .unlocked_ioctl = hpfs_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 1ecec124e76f..b36abf9cb345 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -215,6 +215,7 @@ const struct file_operations hpfs_file_ops = .fsync = hpfs_file_fsync, .splice_read = generic_file_splice_read, .unlocked_ioctl = hpfs_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; const struct inode_operations hpfs_file_iops = diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a478df035651..d5c2a3158610 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ @@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. 
*/ -static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) +static int do_hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t dev, + bool tmpfile) { struct inode *inode; int error = -ENOSPC; @@ -824,13 +827,23 @@ static int hugetlbfs_mknod(struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (inode) { dir->i_ctime = dir->i_mtime = current_time(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + if (tmpfile) { + d_tmpfile(dentry, inode); + } else { + d_instantiate(dentry, inode); + dget(dentry);/* Extra count - pin the dentry in core */ + } error = 0; } return error; } +static int hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) +{ + return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); +} + static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); @@ -844,6 +857,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } +static int hugetlbfs_tmpfile(struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); +} + static int hugetlbfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -1102,6 +1121,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = { .mknod = hugetlbfs_mknod, .rename = simple_rename, .setattr = hugetlbfs_setattr, + .tmpfile = hugetlbfs_tmpfile, }; static const struct inode_operations hugetlbfs_inode_operations = { @@ -1461,28 +1481,41 @@ static int __init init_hugetlbfs_fs(void) sizeof(struct hugetlbfs_inode_info), 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) - goto out2; + goto out; error = register_filesystem(&hugetlbfs_fs_type); if (error) - goto out; + goto out_free; + /* default hstate mount is required */ + mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]); + if (IS_ERR(mnt)) { + error = PTR_ERR(mnt); + goto out_unreg; + } + hugetlbfs_vfsmount[default_hstate_idx] = mnt; + + /* other hstates are optional */ i = 0; for_each_hstate(h) { + if (i == default_hstate_idx) + continue; + mnt = mount_one_hugetlbfs(h); - if (IS_ERR(mnt) && i == 0) { - error = PTR_ERR(mnt); - goto out; - } - hugetlbfs_vfsmount[i] = mnt; + if (IS_ERR(mnt)) + hugetlbfs_vfsmount[i] = NULL; + else + hugetlbfs_vfsmount[i] = mnt; i++; } return 0; - out: + out_unreg: + (void)unregister_filesystem(&hugetlbfs_fs_type); + out_free: kmem_cache_destroy(hugetlbfs_inode_cachep); - out2: + out: return error; } fs_initcall(init_hugetlbfs_fs) diff --git a/fs/io-wq.c b/fs/io-wq.c new file mode 100644 index 000000000000..74b40506c5d9 --- /dev/null +++ b/fs/io-wq.c @@ -0,0 +1,1094 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Basic worker thread pool for io_uring + * + * Copyright (C) 2019 Jens Axboe + * + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/sched/signal.h> +#include <linux/mm.h> +#include <linux/mmu_context.h> +#include <linux/sched/mm.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <linux/rculist_nulls.h> + +#include "io-wq.h" + +#define WORKER_IDLE_TIMEOUT (5 * HZ) + +enum { + IO_WORKER_F_UP = 1, /* up and active */ + IO_WORKER_F_RUNNING = 2, /* account as running */ + IO_WORKER_F_FREE = 4, /* worker on free list 
*/ + IO_WORKER_F_EXITING = 8, /* worker exiting */ + IO_WORKER_F_FIXED = 16, /* static idle worker */ + IO_WORKER_F_BOUND = 32, /* is doing bounded work */ +}; + +enum { + IO_WQ_BIT_EXIT = 0, /* wq exiting */ + IO_WQ_BIT_CANCEL = 1, /* cancel work on list */ + IO_WQ_BIT_ERROR = 2, /* error on setup */ +}; + +enum { + IO_WQE_FLAG_STALLED = 1, /* stalled on hash */ +}; + +/* + * One for each thread in a wqe pool + */ +struct io_worker { + refcount_t ref; + unsigned flags; + struct hlist_nulls_node nulls_node; + struct list_head all_list; + struct task_struct *task; + wait_queue_head_t wait; + struct io_wqe *wqe; + + struct io_wq_work *cur_work; + spinlock_t lock; + + struct rcu_head rcu; + struct mm_struct *mm; + const struct cred *creds; + struct files_struct *restore_files; +}; + +#if BITS_PER_LONG == 64 +#define IO_WQ_HASH_ORDER 6 +#else +#define IO_WQ_HASH_ORDER 5 +#endif + +struct io_wqe_acct { + unsigned nr_workers; + unsigned max_workers; + atomic_t nr_running; +}; + +enum { + IO_WQ_ACCT_BOUND, + IO_WQ_ACCT_UNBOUND, +}; + +/* + * Per-node worker thread pool + */ +struct io_wqe { + struct { + spinlock_t lock; + struct io_wq_work_list work_list; + unsigned long hash_map; + unsigned flags; + } ____cacheline_aligned_in_smp; + + int node; + struct io_wqe_acct acct[2]; + + struct hlist_nulls_head free_list; + struct hlist_nulls_head busy_list; + struct list_head all_list; + + struct io_wq *wq; +}; + +/* + * Per io_wq state + */ +struct io_wq { + struct io_wqe **wqes; + unsigned long state; + + get_work_fn *get_work; + put_work_fn *put_work; + + struct task_struct *manager; + struct user_struct *user; + const struct cred *creds; + struct mm_struct *mm; + refcount_t refs; + struct completion done; +}; + +static bool io_worker_get(struct io_worker *worker) +{ + return refcount_inc_not_zero(&worker->ref); +} + +static void io_worker_release(struct io_worker *worker) +{ + if (refcount_dec_and_test(&worker->ref)) + wake_up_process(worker->task); +} + +/* + * Note: drops the wqe->lock if returning true! The caller must re-acquire + * the lock in that case. Some callers need to restart handling if this + * happens, so we can't just re-acquire the lock on behalf of the caller. + */ +static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) +{ + bool dropped_lock = false; + + if (worker->creds) { + revert_creds(worker->creds); + worker->creds = NULL; + } + + if (current->files != worker->restore_files) { + __acquire(&wqe->lock); + spin_unlock_irq(&wqe->lock); + dropped_lock = true; + + task_lock(current); + current->files = worker->restore_files; + task_unlock(current); + } + + /* + * If we have an active mm, we need to drop the wq lock before unusing + * it. If we do, return true and let the caller retry the idle loop. 
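That drop-and-report contract, where returning true means the lock was already released, is the subtle part: the caller must either re-lock or restart. A minimal pthread sketch of the same idiom, with hypothetical names and nothing io-wq specific:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* May release 'lock' to do blocking work; returns true if it did,
 * and does NOT reacquire it, mirroring __io_worker_unuse(). */
static bool maybe_unuse(bool need_blocking_work)
{
	if (!need_blocking_work)
		return false;			/* lock still held */
	pthread_mutex_unlock(&lock);
	/* ... blocking cleanup would run here, lock not held ... */
	return true;				/* caller must re-lock */
}

static void caller(bool need_blocking_work)
{
	pthread_mutex_lock(&lock);
	if (maybe_unuse(need_blocking_work))
		pthread_mutex_lock(&lock);	/* re-lock; state may have changed */
	/* ... continue under the lock ... */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	caller(false);
	caller(true);
	puts("ok");
	return 0;
}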
+ */ + if (worker->mm) { + if (!dropped_lock) { + __acquire(&wqe->lock); + spin_unlock_irq(&wqe->lock); + dropped_lock = true; + } + __set_current_state(TASK_RUNNING); + set_fs(KERNEL_DS); + unuse_mm(worker->mm); + mmput(worker->mm); + worker->mm = NULL; + } + + return dropped_lock; +} + +static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, + struct io_wq_work *work) +{ + if (work->flags & IO_WQ_WORK_UNBOUND) + return &wqe->acct[IO_WQ_ACCT_UNBOUND]; + + return &wqe->acct[IO_WQ_ACCT_BOUND]; +} + +static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, + struct io_worker *worker) +{ + if (worker->flags & IO_WORKER_F_BOUND) + return &wqe->acct[IO_WQ_ACCT_BOUND]; + + return &wqe->acct[IO_WQ_ACCT_UNBOUND]; +} + +static void io_worker_exit(struct io_worker *worker) +{ + struct io_wqe *wqe = worker->wqe; + struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + unsigned nr_workers; + + /* + * If we're not at zero, someone else is holding a brief reference + * to the worker. Wait for that to go away. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (!refcount_dec_and_test(&worker->ref)) + schedule(); + __set_current_state(TASK_RUNNING); + + preempt_disable(); + current->flags &= ~PF_IO_WORKER; + if (worker->flags & IO_WORKER_F_RUNNING) + atomic_dec(&acct->nr_running); + if (!(worker->flags & IO_WORKER_F_BOUND)) + atomic_dec(&wqe->wq->user->processes); + worker->flags = 0; + preempt_enable(); + + spin_lock_irq(&wqe->lock); + hlist_nulls_del_rcu(&worker->nulls_node); + list_del_rcu(&worker->all_list); + if (__io_worker_unuse(wqe, worker)) { + __release(&wqe->lock); + spin_lock_irq(&wqe->lock); + } + acct->nr_workers--; + nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers + + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers; + spin_unlock_irq(&wqe->lock); + + /* all workers gone, wq exit can proceed */ + if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs)) + complete(&wqe->wq->done); + + kfree_rcu(worker, rcu); +} + +static inline bool io_wqe_run_queue(struct io_wqe *wqe) + __must_hold(wqe->lock) +{ + if (!wq_list_empty(&wqe->work_list) && + !(wqe->flags & IO_WQE_FLAG_STALLED)) + return true; + return false; +} + +/* + * Check head of free list for an available worker. If one isn't available, + * caller must wake up the wq manager to create one. + */ +static bool io_wqe_activate_free_worker(struct io_wqe *wqe) + __must_hold(RCU) +{ + struct hlist_nulls_node *n; + struct io_worker *worker; + + n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list)); + if (is_a_nulls(n)) + return false; + + worker = hlist_nulls_entry(n, struct io_worker, nulls_node); + if (io_worker_get(worker)) { + wake_up(&worker->wait); + io_worker_release(worker); + return true; + } + + return false; +} + +/* + * We need a worker. If we find a free one, we're good. If not, and we're + * below the max number of workers, wake up the manager to create one. + */ +static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) +{ + bool ret; + + /* + * Most likely an attempt to queue unbounded work on an io_wq that + * wasn't setup with any unbounded workers. 
+ */ + WARN_ON_ONCE(!acct->max_workers); + + rcu_read_lock(); + ret = io_wqe_activate_free_worker(wqe); + rcu_read_unlock(); + + if (!ret && acct->nr_workers < acct->max_workers) + wake_up_process(wqe->wq->manager); +} + +static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + + atomic_inc(&acct->nr_running); +} + +static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker) + __must_hold(wqe->lock) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + + if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) + io_wqe_wake_worker(wqe, acct); +} + +static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) +{ + allow_kernel_signal(SIGINT); + + current->flags |= PF_IO_WORKER; + + worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + worker->restore_files = current->files; + io_wqe_inc_running(wqe, worker); +} + +/* + * Worker will start processing some work. Move it to the busy list, if + * it's currently on the freelist + */ +static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, + struct io_wq_work *work) + __must_hold(wqe->lock) +{ + bool worker_bound, work_bound; + + if (worker->flags & IO_WORKER_F_FREE) { + worker->flags &= ~IO_WORKER_F_FREE; + hlist_nulls_del_init_rcu(&worker->nulls_node); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list); + } + + /* + * If worker is moving from bound to unbound (or vice versa), then + * ensure we update the running accounting. + */ + worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; + work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; + if (worker_bound != work_bound) { + io_wqe_dec_running(wqe, worker); + if (work_bound) { + worker->flags |= IO_WORKER_F_BOUND; + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--; + wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++; + atomic_dec(&wqe->wq->user->processes); + } else { + worker->flags &= ~IO_WORKER_F_BOUND; + wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++; + wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--; + atomic_inc(&wqe->wq->user->processes); + } + io_wqe_inc_running(wqe, worker); + } +} + +/* + * No work, worker going to sleep. Move to freelist, and unuse mm if we + * have one attached. Dropping the mm may potentially sleep, so we drop + * the lock in that case and return success. Since the caller has to + * retry the loop in that case (we changed task state), we don't regrab + * the lock if we return success. 
+ */ +static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) + __must_hold(wqe->lock) +{ + if (!(worker->flags & IO_WORKER_F_FREE)) { + worker->flags |= IO_WORKER_F_FREE; + hlist_nulls_del_init_rcu(&worker->nulls_node); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + } + + return __io_worker_unuse(wqe, worker); +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) + __must_hold(wqe->lock) +{ + struct io_wq_work_node *node, *prev; + struct io_wq_work *work; + + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + + /* not hashed, can run anytime */ + if (!(work->flags & IO_WQ_WORK_HASHED)) { + wq_node_del(&wqe->work_list, node, prev); + return work; + } + + /* hashed, can run if not already running */ + *hash = work->flags >> IO_WQ_HASH_SHIFT; + if (!(wqe->hash_map & BIT_ULL(*hash))) { + wqe->hash_map |= BIT_ULL(*hash); + wq_node_del(&wqe->work_list, node, prev); + return work; + } + } + + return NULL; +} + +static void io_worker_handle_work(struct io_worker *worker) + __releases(wqe->lock) +{ + struct io_wq_work *work, *old_work = NULL, *put_work = NULL; + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + + do { + unsigned hash = -1U; + + /* + * If we got some work, mark us as busy. If we didn't, but + * the list isn't empty, it means we stalled on hashed work. + * Mark us stalled so we don't keep looking for work when we + * can't make progress, any work completion or insertion will + * clear the stalled flag. + */ + work = io_get_next_work(wqe, &hash); + if (work) + __io_worker_busy(wqe, worker, work); + else if (!wq_list_empty(&wqe->work_list)) + wqe->flags |= IO_WQE_FLAG_STALLED; + + spin_unlock_irq(&wqe->lock); + if (put_work && wq->put_work) + wq->put_work(old_work); + if (!work) + break; +next: + /* flush any pending signals before assigning new work */ + if (signal_pending(current)) + flush_signals(current); + + spin_lock_irq(&worker->lock); + worker->cur_work = work; + spin_unlock_irq(&worker->lock); + + if (work->flags & IO_WQ_WORK_CB) + work->func(&work); + + if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && + current->files != work->files) { + task_lock(current); + current->files = work->files; + task_unlock(current); + } + if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && + wq->mm && mmget_not_zero(wq->mm)) { + use_mm(wq->mm); + set_fs(USER_DS); + worker->mm = wq->mm; + } + if (!worker->creds) + worker->creds = override_creds(wq->creds); + if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) + work->flags |= IO_WQ_WORK_CANCEL; + if (worker->mm) + work->flags |= IO_WQ_WORK_HAS_MM; + + if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) { + put_work = work; + wq->get_work(work); + } + + old_work = work; + work->func(&work); + + spin_lock_irq(&worker->lock); + worker->cur_work = NULL; + spin_unlock_irq(&worker->lock); + + spin_lock_irq(&wqe->lock); + + if (hash != -1U) { + wqe->hash_map &= ~BIT_ULL(hash); + wqe->flags &= ~IO_WQE_FLAG_STALLED; + } + if (work && work != old_work) { + spin_unlock_irq(&wqe->lock); + + if (put_work && wq->put_work) { + wq->put_work(put_work); + put_work = NULL; + } + + /* dependent work not hashed */ + hash = -1U; + goto next; + } + } while (1); +} + +static int io_wqe_worker(void *data) +{ + struct io_worker *worker = data; + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + DEFINE_WAIT(wait); + + io_worker_start(wqe, worker); + + while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + 
prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE); + + spin_lock_irq(&wqe->lock); + if (io_wqe_run_queue(wqe)) { + __set_current_state(TASK_RUNNING); + io_worker_handle_work(worker); + continue; + } + /* drops the lock on success, retry */ + if (__io_worker_idle(wqe, worker)) { + __release(&wqe->lock); + continue; + } + spin_unlock_irq(&wqe->lock); + if (signal_pending(current)) + flush_signals(current); + if (schedule_timeout(WORKER_IDLE_TIMEOUT)) + continue; + /* timed out, exit unless we're the fixed worker */ + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || + !(worker->flags & IO_WORKER_F_FIXED)) + break; + } + + finish_wait(&worker->wait, &wait); + + if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { + spin_lock_irq(&wqe->lock); + if (!wq_list_empty(&wqe->work_list)) + io_worker_handle_work(worker); + else + spin_unlock_irq(&wqe->lock); + } + + io_worker_exit(worker); + return 0; +} + +/* + * Called when a worker is scheduled in. Mark us as currently running. + */ +void io_wq_worker_running(struct task_struct *tsk) +{ + struct io_worker *worker = kthread_data(tsk); + struct io_wqe *wqe = worker->wqe; + + if (!(worker->flags & IO_WORKER_F_UP)) + return; + if (worker->flags & IO_WORKER_F_RUNNING) + return; + worker->flags |= IO_WORKER_F_RUNNING; + io_wqe_inc_running(wqe, worker); +} + +/* + * Called when a worker is going to sleep. If there are no workers currently + * running and we have work pending, wake up a free one or have the manager + * set one up. + */ +void io_wq_worker_sleeping(struct task_struct *tsk) +{ + struct io_worker *worker = kthread_data(tsk); + struct io_wqe *wqe = worker->wqe; + + if (!(worker->flags & IO_WORKER_F_UP)) + return; + if (!(worker->flags & IO_WORKER_F_RUNNING)) + return; + + worker->flags &= ~IO_WORKER_F_RUNNING; + + spin_lock_irq(&wqe->lock); + io_wqe_dec_running(wqe, worker); + spin_unlock_irq(&wqe->lock); +} + +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +{ + struct io_wqe_acct *acct = &wqe->acct[index]; + struct io_worker *worker; + + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + if (!worker) + return false; + + refcount_set(&worker->ref, 1); + worker->nulls_node.pprev = NULL; + init_waitqueue_head(&worker->wait); + worker->wqe = wqe; + spin_lock_init(&worker->lock); + + worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, + "io_wqe_worker-%d/%d", index, wqe->node); + if (IS_ERR(worker->task)) { + kfree(worker); + return false; + } + + spin_lock_irq(&wqe->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + list_add_tail_rcu(&worker->all_list, &wqe->all_list); + worker->flags |= IO_WORKER_F_FREE; + if (index == IO_WQ_ACCT_BOUND) + worker->flags |= IO_WORKER_F_BOUND; + if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) + worker->flags |= IO_WORKER_F_FIXED; + acct->nr_workers++; + spin_unlock_irq(&wqe->lock); + + if (index == IO_WQ_ACCT_UNBOUND) + atomic_inc(&wq->user->processes); + + wake_up_process(worker->task); + return true; +} + +static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) + __must_hold(wqe->lock) +{ + struct io_wqe_acct *acct = &wqe->acct[index]; + + /* if we have available workers or no work, no need */ + if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe)) + return false; + return acct->nr_workers < acct->max_workers; +} + +/* + * Manager thread. Tasked with creating new workers, if we need them.
+ */ +static int io_wq_manager(void *data) +{ + struct io_wq *wq = data; + int workers_to_create = num_possible_nodes(); + int node; + + /* create fixed workers */ + refcount_set(&wq->refs, workers_to_create); + for_each_node(node) { + if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + goto err; + workers_to_create--; + } + + complete(&wq->done); + + while (!kthread_should_stop()) { + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + bool fork_worker[2] = { false, false }; + + spin_lock_irq(&wqe->lock); + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) + fork_worker[IO_WQ_ACCT_BOUND] = true; + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) + fork_worker[IO_WQ_ACCT_UNBOUND] = true; + spin_unlock_irq(&wqe->lock); + if (fork_worker[IO_WQ_ACCT_BOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); + if (fork_worker[IO_WQ_ACCT_UNBOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); + } + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + + return 0; +err: + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + if (refcount_sub_and_test(workers_to_create, &wq->refs)) + complete(&wq->done); + return 0; +} + +static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, + struct io_wq_work *work) +{ + bool free_worker; + + if (!(work->flags & IO_WQ_WORK_UNBOUND)) + return true; + if (atomic_read(&acct->nr_running)) + return true; + + rcu_read_lock(); + free_worker = !hlist_nulls_empty(&wqe->free_list); + rcu_read_unlock(); + if (free_worker) + return true; + + if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers && + !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN))) + return false; + + return true; +} + +static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) +{ + struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + unsigned long flags; + + /* + * Do early check to see if we need a new unbound worker, and if we do, + * if we're allowed to do so. This isn't 100% accurate as there's a + * gap between this check and incrementing the value, but that's OK. + * It's close enough to not be an issue, fork() has the same delay. + */ + if (unlikely(!io_wq_can_queue(wqe, acct, work))) { + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + return; + } + + spin_lock_irqsave(&wqe->lock, flags); + wq_list_add_tail(&work->list, &wqe->work_list); + wqe->flags &= ~IO_WQE_FLAG_STALLED; + spin_unlock_irqrestore(&wqe->lock, flags); + + if (!atomic_read(&acct->nr_running)) + io_wqe_wake_worker(wqe, acct); +} + +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) +{ + struct io_wqe *wqe = wq->wqes[numa_node_id()]; + + io_wqe_enqueue(wqe, work); +} + +/* + * Enqueue work, hashed by some key. Work items that hash to the same value + * will not be done in parallel. Used to limit concurrent writes, generally + * hashed by inode. 
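The serialization key travels inside the work item's flags word: as encoded in io_wq_enqueue_hashed() just below, the bucket derived from the caller's key is stored in the bits above IO_WQ_HASH_SHIFT, and io_get_next_work() recovers it with a plain shift. A small sketch of that packing, with the flag values copied from io-wq.h; the kernel derives the bucket with hash_ptr(), here it is simply an argument:

#include <assert.h>

/* Flag layout from io-wq.h: low bits are IO_WQ_WORK_* flags, the bits
 * from IO_WQ_HASH_SHIFT up carry the hash bucket. */
#define IO_WQ_WORK_HASHED	4
#define IO_WQ_HASH_SHIFT	24

static unsigned pack_hashed(unsigned flags, unsigned bucket)
{
	return flags | IO_WQ_WORK_HASHED | (bucket << IO_WQ_HASH_SHIFT);
}

static unsigned unpack_bucket(unsigned flags)
{
	/* io_get_next_work() recovers the bucket the same way */
	return flags >> IO_WQ_HASH_SHIFT;
}

int main(void)
{
	unsigned flags = pack_hashed(0, 37);

	assert(flags & IO_WQ_WORK_HASHED);
	assert(unpack_bucket(flags) == 37);
	return 0;
}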
+ */ +void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val) +{ + struct io_wqe *wqe = wq->wqes[numa_node_id()]; + unsigned bit; + + + bit = hash_ptr(val, IO_WQ_HASH_ORDER); + work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); + io_wqe_enqueue(wqe, work); +} + +static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) +{ + send_sig(SIGINT, worker->task, 1); + return false; +} + +/* + * Iterate the passed in list and call the specific function for each + * worker that isn't exiting + */ +static bool io_wq_for_each_worker(struct io_wqe *wqe, + bool (*func)(struct io_worker *, void *), + void *data) +{ + struct io_worker *worker; + bool ret = false; + + list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + if (io_worker_get(worker)) { + ret = func(worker, data); + io_worker_release(worker); + if (ret) + break; + } + } + + return ret; +} + +void io_wq_cancel_all(struct io_wq *wq) +{ + int node; + + set_bit(IO_WQ_BIT_CANCEL, &wq->state); + + /* + * Browse both lists, as there's a gap between handing work off + * to a worker and the worker putting itself on the busy_list + */ + rcu_read_lock(); + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL); + } + rcu_read_unlock(); +} + +struct io_cb_cancel_data { + struct io_wqe *wqe; + work_cancel_fn *cancel; + void *caller_data; +}; + +static bool io_work_cancel(struct io_worker *worker, void *cancel_data) +{ + struct io_cb_cancel_data *data = cancel_data; + unsigned long flags; + bool ret = false; + + /* + * Hold the lock to avoid ->cur_work going out of scope, caller + * may dereference the passed in work. + */ + spin_lock_irqsave(&worker->lock, flags); + if (worker->cur_work && + data->cancel(worker->cur_work, data->caller_data)) { + send_sig(SIGINT, worker->task, 1); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + + return ret; +} + +static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, + work_cancel_fn *cancel, + void *cancel_data) +{ + struct io_cb_cancel_data data = { + .wqe = wqe, + .cancel = cancel, + .caller_data = cancel_data, + }; + struct io_wq_work_node *node, *prev; + struct io_wq_work *work; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&wqe->lock, flags); + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + + if (cancel(work, cancel_data)) { + wq_node_del(&wqe->work_list, node, prev); + found = true; + break; + } + } + spin_unlock_irqrestore(&wqe->lock, flags); + + if (found) { + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + return IO_WQ_CANCEL_OK; + } + + rcu_read_lock(); + found = io_wq_for_each_worker(wqe, io_work_cancel, &data); + rcu_read_unlock(); + return found ? 
IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; +} + +enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, + void *data) +{ + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + int node; + + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + ret = io_wqe_cancel_cb_work(wqe, cancel, data); + if (ret != IO_WQ_CANCEL_NOTFOUND) + break; + } + + return ret; +} + +static bool io_wq_worker_cancel(struct io_worker *worker, void *data) +{ + struct io_wq_work *work = data; + unsigned long flags; + bool ret = false; + + if (worker->cur_work != work) + return false; + + spin_lock_irqsave(&worker->lock, flags); + if (worker->cur_work == work) { + send_sig(SIGINT, worker->task, 1); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + + return ret; +} + +static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, + struct io_wq_work *cwork) +{ + struct io_wq_work_node *node, *prev; + struct io_wq_work *work; + unsigned long flags; + bool found = false; + + cwork->flags |= IO_WQ_WORK_CANCEL; + + /* + * First check pending list, if we're lucky we can just remove it + * from there. CANCEL_OK means that the work is returned as-new, + * no completion will be posted for it. + */ + spin_lock_irqsave(&wqe->lock, flags); + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + + if (work == cwork) { + wq_node_del(&wqe->work_list, node, prev); + found = true; + break; + } + } + spin_unlock_irqrestore(&wqe->lock, flags); + + if (found) { + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + return IO_WQ_CANCEL_OK; + } + + /* + * Now check if a free (going busy) or busy worker has the work + * currently running. If we find it there, we'll return CANCEL_RUNNING + * as an indication that we attempted to signal cancellation. The + * completion will run normally in this case. + */ + rcu_read_lock(); + found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork); + rcu_read_unlock(); + return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; +} + +enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) +{ + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + int node; + + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + ret = io_wqe_cancel_work(wqe, cwork); + if (ret != IO_WQ_CANCEL_NOTFOUND) + break; + } + + return ret; +} + +struct io_wq_flush_data { + struct io_wq_work work; + struct completion done; +}; + +static void io_wq_flush_func(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_wq_flush_data *data; + + data = container_of(work, struct io_wq_flush_data, work); + complete(&data->done); +} + +/* + * Doesn't wait for previously queued work to finish. When this completes, + * it just means that previously queued work was started.
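The flush implemented below is the classic barrier trick: enqueue a sentinel work item whose only job is to fire a completion, then sleep on it; once it fires, everything queued before the sentinel has at least started. A rough pthread analogue of sentinel-plus-completion, using a hypothetical one-shot worker thread rather than the io-wq API:

#include <pthread.h>
#include <stdio.h>

/* A "completion" in miniature: condvar plus done flag. */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

struct flush_work {
	void (*func)(struct flush_work *);
	struct completion done;
};

/* The sentinel's payload: just signal the completion. */
static void flush_func(struct flush_work *work)
{
	struct completion *c = &work->done;

	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* Stand-in for a queue worker: runs the item once. */
static void *worker(void *arg)
{
	struct flush_work *work = arg;

	work->func(work);
	return NULL;
}

int main(void)
{
	struct flush_work work = {
		.func = flush_func,
		.done = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 },
	};
	pthread_t thr;

	pthread_create(&thr, NULL, worker, &work);	/* "enqueue" the sentinel */

	pthread_mutex_lock(&work.done.lock);		/* wait_for_completion() */
	while (!work.done.done)
		pthread_cond_wait(&work.done.cond, &work.done.lock);
	pthread_mutex_unlock(&work.done.lock);

	pthread_join(thr, NULL);
	puts("flushed: everything queued before the sentinel has started");
	return 0;
}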
+ */ +void io_wq_flush(struct io_wq *wq) +{ + struct io_wq_flush_data data; + int node; + + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + init_completion(&data.done); + INIT_IO_WORK(&data.work, io_wq_flush_func); + data.work.flags |= IO_WQ_WORK_INTERNAL; + io_wqe_enqueue(wqe, &data.work); + wait_for_completion(&data.done); + } +} + +struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) +{ + int ret = -ENOMEM, node; + struct io_wq *wq; + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return ERR_PTR(-ENOMEM); + + wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); + if (!wq->wqes) { + kfree(wq); + return ERR_PTR(-ENOMEM); + } + + wq->get_work = data->get_work; + wq->put_work = data->put_work; + + /* caller must already hold a reference to this */ + wq->user = data->user; + wq->creds = data->creds; + + for_each_node(node) { + struct io_wqe *wqe; + + wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node); + if (!wqe) + goto err; + wq->wqes[node] = wqe; + wqe->node = node; + wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); + if (wq->user) { + wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + task_rlimit(current, RLIMIT_NPROC); + } + atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); + wqe->node = node; + wqe->wq = wq; + spin_lock_init(&wqe->lock); + INIT_WQ_LIST(&wqe->work_list); + INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); + INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); + INIT_LIST_HEAD(&wqe->all_list); + } + + init_completion(&wq->done); + + /* caller must have already done mmgrab() on this mm */ + wq->mm = data->mm; + + wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); + if (!IS_ERR(wq->manager)) { + wake_up_process(wq->manager); + wait_for_completion(&wq->done); + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + ret = -ENOMEM; + goto err; + } + reinit_completion(&wq->done); + return wq; + } + + ret = PTR_ERR(wq->manager); + complete(&wq->done); +err: + for_each_node(node) + kfree(wq->wqes[node]); + kfree(wq->wqes); + kfree(wq); + return ERR_PTR(ret); +} + +static bool io_wq_worker_wake(struct io_worker *worker, void *data) +{ + wake_up_process(worker->task); + return false; +} + +void io_wq_destroy(struct io_wq *wq) +{ + int node; + + set_bit(IO_WQ_BIT_EXIT, &wq->state); + if (wq->manager) + kthread_stop(wq->manager); + + rcu_read_lock(); + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); + rcu_read_unlock(); + + wait_for_completion(&wq->done); + + for_each_node(node) + kfree(wq->wqes[node]); + kfree(wq->wqes); + kfree(wq); +} diff --git a/fs/io-wq.h b/fs/io-wq.h new file mode 100644 index 000000000000..7c333a28e2a7 --- /dev/null +++ b/fs/io-wq.h @@ -0,0 +1,124 @@ +#ifndef INTERNAL_IO_WQ_H +#define INTERNAL_IO_WQ_H + +struct io_wq; + +enum { + IO_WQ_WORK_CANCEL = 1, + IO_WQ_WORK_HAS_MM = 2, + IO_WQ_WORK_HASHED = 4, + IO_WQ_WORK_NEEDS_USER = 8, + IO_WQ_WORK_NEEDS_FILES = 16, + IO_WQ_WORK_UNBOUND = 32, + IO_WQ_WORK_INTERNAL = 64, + IO_WQ_WORK_CB = 128, + + IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ +}; + +enum io_wq_cancel { + IO_WQ_CANCEL_OK, /* cancelled before started */ + IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */ + IO_WQ_CANCEL_NOTFOUND, /* work not found */ +}; + +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + +static inline void wq_list_add_tail(struct 
io_wq_work_node *node, + struct io_wq_work_list *list) +{ + if (!list->first) { + list->first = list->last = node; + } else { + list->last->next = node; + list->last = node; + } +} + +static inline void wq_node_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + if (node == list->first) + list->first = node->next; + if (node == list->last) + list->last = prev; + if (prev) + prev->next = node->next; + node->next = NULL; +} + +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) ((list)->first == NULL) +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ + (list)->last = NULL; \ +} while (0) + +struct io_wq_work { + union { + struct io_wq_work_node list; + void *data; + }; + void (*func)(struct io_wq_work **); + struct files_struct *files; + unsigned flags; +}; + +#define INIT_IO_WORK(work, _func) \ + do { \ + (work)->list.next = NULL; \ + (work)->func = _func; \ + (work)->flags = 0; \ + (work)->files = NULL; \ + } while (0) \ + +typedef void (get_work_fn)(struct io_wq_work *); +typedef void (put_work_fn)(struct io_wq_work *); + +struct io_wq_data { + struct mm_struct *mm; + struct user_struct *user; + const struct cred *creds; + + get_work_fn *get_work; + put_work_fn *put_work; +}; + +struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); +void io_wq_destroy(struct io_wq *wq); + +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); +void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); +void io_wq_flush(struct io_wq *wq); + +void io_wq_cancel_all(struct io_wq *wq); +enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); + +typedef bool (work_cancel_fn)(struct io_wq_work *, void *); + +enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, + void *data); + +#if defined(CONFIG_IO_WQ) +extern void io_wq_worker_sleeping(struct task_struct *); +extern void io_wq_worker_running(struct task_struct *); +#else +static inline void io_wq_worker_sleeping(struct task_struct *tsk) +{ +} +static inline void io_wq_worker_running(struct task_struct *tsk) +{ +} +#endif /* CONFIG_IO_WQ */ + +#endif /* INTERNAL_IO_WQ_H */ diff --git a/fs/io_uring.c b/fs/io_uring.c index 76fdbe84aff5..405be10da73d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -56,7 +56,6 @@ #include <linux/mmu_context.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/bvec.h> @@ -70,13 +69,26 @@ #include <linux/nospec.h> #include <linux/sizes.h> #include <linux/hugetlb.h> +#include <linux/highmem.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "internal.h" +#include "io-wq.h" #define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_FIXED_FILES 1024 +#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) + +/* + * Shift of 9 is 512 entries, or exactly one page on 64-bit archs + */ +#define IORING_FILE_TABLE_SHIFT 9 +#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) +#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) +#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -133,7 +145,7 @@ struct io_rings { /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure - * there are not 
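The wq_list primitives above are deliberately bare: singly linked with a tail pointer, so appends are O(1) but deletion needs the walker to carry the predecessor. A userspace copy with a short usage run, names simplified and the container_of bookkeeping omitted:

#include <assert.h>
#include <stddef.h>

struct node { struct node *next; };
struct list { struct node *first, *last; };

static void list_add_tail(struct node *n, struct list *l)
{
	if (!l->first) {
		l->first = l->last = n;
	} else {
		l->last->next = n;
		l->last = n;
	}
}

static void node_del(struct list *l, struct node *n, struct node *prev)
{
	if (n == l->first)
		l->first = n->next;
	if (n == l->last)
		l->last = prev;
	if (prev)
		prev->next = n->next;
	n->next = NULL;
}

int main(void)
{
	struct list l = { NULL, NULL };
	struct node a = {0}, b = {0}, c = {0};
	struct node *pos, *prev;

	list_add_tail(&a, &l);
	list_add_tail(&b, &l);
	list_add_tail(&c, &l);

	/* wq_list_for_each-style walk, tracking the predecessor so any
	 * node can be unlinked mid-walk, as io_get_next_work() does. */
	for (pos = l.first, prev = NULL; pos; prev = pos, pos = pos->next) {
		if (pos == &b) {
			node_del(&l, pos, prev);
			break;
		}
	}
	assert(l.first == &a && a.next == &c && l.last == &c);
	return 0;
}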
more requests pending thatn there is space in + * there are not more requests pending than there is space in * the completion queue. * * Written by the kernel, shouldn't be modified by the @@ -161,14 +173,8 @@ struct io_mapped_ubuf { unsigned int nr_bvecs; }; -struct async_list { - spinlock_t lock; - atomic_t cnt; - struct list_head list; - - struct file *file; - off_t io_start; - size_t io_len; +struct fixed_file_table { + struct file **files; }; struct io_ring_ctx { @@ -180,6 +186,8 @@ struct io_ring_ctx { unsigned int flags; bool compat; bool account_mem; + bool cq_overflow_flushed; + bool drain_next; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -197,37 +205,31 @@ struct io_ring_ctx { unsigned sq_entries; unsigned sq_mask; unsigned sq_thread_idle; + unsigned cached_sq_dropped; + atomic_t cached_cq_overflow; struct io_uring_sqe *sq_sqes; struct list_head defer_list; struct list_head timeout_list; + struct list_head cq_overflow_list; + + wait_queue_head_t inflight_wait; } ____cacheline_aligned_in_smp; + struct io_rings *rings; + /* IO offload */ - struct workqueue_struct *sqo_wq[2]; + struct io_wq *io_wq; struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; wait_queue_head_t sqo_wait; - struct completion sqo_thread_started; - - struct { - unsigned cached_cq_tail; - unsigned cq_entries; - unsigned cq_mask; - struct wait_queue_head cq_wait; - struct fasync_struct *cq_fasync; - struct eventfd_ctx *cq_ev_fd; - atomic_t cq_timeouts; - } ____cacheline_aligned_in_smp; - - struct io_rings *rings; /* * If used, fixed file set. Writers must ensure that ->refs is dead, * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct file **user_files; + struct fixed_file_table *file_table; unsigned nr_user_files; /* if used, fixed mapped user buffers */ @@ -236,7 +238,27 @@ struct io_ring_ctx { struct user_struct *user; - struct completion ctx_done; + const struct cred *creds; + + /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ + struct completion *completions; + + /* if all else fails... */ + struct io_kiocb *fallback_req; + +#if defined(CONFIG_UNIX) + struct socket *ring_sock; +#endif + + struct { + unsigned cached_cq_tail; + unsigned cq_entries; + unsigned cq_mask; + atomic_t cq_timeouts; + struct wait_queue_head cq_wait; + struct fasync_struct *cq_fasync; + struct eventfd_ctx *cq_ev_fd; + } ____cacheline_aligned_in_smp; struct { struct mutex uring_lock; @@ -253,23 +275,12 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. 
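With IORING_FILE_TABLE_SHIFT of 9, the new fixed_file_table above replaces the flat user_files array with a two-level table: 512 file pointers per second-level block, up to 64 blocks. The lookup helper itself is not part of this hunk, but the shift/mask constants imply the index split sketched here, with the constants copied from the #defines above:

#include <assert.h>

#define IORING_FILE_TABLE_SHIFT	9
#define IORING_MAX_FILES_TABLE	(1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK	(IORING_MAX_FILES_TABLE - 1)

/* A registered-file index selects a table, then a slot inside it. */
static unsigned table_index(unsigned i) { return i >> IORING_FILE_TABLE_SHIFT; }
static unsigned table_slot(unsigned i)  { return i & IORING_FILE_TABLE_MASK; }

int main(void)
{
	/* index 1000 lives in the second table, slot 488 */
	assert(table_index(1000) == 1);
	assert(table_slot(1000) == 488);
	return 0;
}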
*/ struct list_head poll_list; - struct list_head cancel_list; - } ____cacheline_aligned_in_smp; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; - struct async_list pending_async[2]; - -#if defined(CONFIG_UNIX) - struct socket *ring_sock; -#endif -}; - -struct sqe_submit { - const struct io_uring_sqe *sqe; - unsigned short index; - u32 sequence; - bool has_user; - bool needs_lock; - bool needs_fixed_file; + spinlock_t inflight_lock; + struct list_head inflight_list; + } ____cacheline_aligned_in_smp; }; /* @@ -282,12 +293,43 @@ struct io_poll_iocb { __poll_t events; bool done; bool canceled; - struct wait_queue_entry wait; + struct wait_queue_entry *wait; }; -struct io_timeout { - struct file *file; +struct io_timeout_data { + struct io_kiocb *req; struct hrtimer timer; + struct timespec64 ts; + enum hrtimer_mode mode; + u32 seq_offset; +}; + +struct io_async_connect { + struct sockaddr_storage address; +}; + +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + struct sockaddr __user *uaddr; + struct msghdr msg; +}; + +struct io_async_rw { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + ssize_t nr_segs; + ssize_t size; +}; + +struct io_async_ctx { + struct io_uring_sqe sqe; + union { + struct io_async_rw rw; + struct io_async_msghdr msg; + struct io_async_connect connect; + struct io_timeout_data timeout; + }; }; /* @@ -301,32 +343,47 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; - struct io_timeout timeout; }; - struct sqe_submit submit; + const struct io_uring_sqe *sqe; + struct io_async_ctx *io; + struct file *ring_file; + int ring_fd; + bool has_user; + bool in_async; + bool needs_fixed_file; struct io_ring_ctx *ctx; - struct list_head list; + union { + struct list_head list; + struct hlist_node hash_node; + }; struct list_head link_list; unsigned int flags; refcount_t refs; #define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_SEQ_PREV 8 /* sequential with previous */ +#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_DONE 128 /* linked sqes done */ +#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ #define REQ_F_TIMEOUT 1024 /* timeout request */ +#define REQ_F_ISREG 2048 /* regular file */ +#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ +#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ +#define REQ_F_INFLIGHT 16384 /* on inflight list */ +#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ u64 user_data; u32 result; u32 sequence; - struct work_struct work; + struct list_head inflight_entry; + + struct io_wq_work work; }; #define IO_PLUG_THRESHOLD 2 @@ -352,10 +409,14 @@ struct io_submit_state { unsigned int ios_left; }; -static void io_sq_wq_submit_work(struct work_struct *work); -static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, - long res); +static void io_wq_submit_work(struct io_wq_work **workptr); +static void io_cqring_fill_event(struct io_kiocb *req, long res); static void __io_free_req(struct io_kiocb *req); +static void io_put_req(struct 
io_kiocb *req); +static void io_double_put_req(struct io_kiocb *req); +static void __io_double_put_req(struct io_kiocb *req); +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); +static void io_queue_linked_timeout(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -378,56 +439,83 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); - complete(&ctx->ctx_done); + complete(&ctx->completions[0]); } static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int i; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL); + if (!ctx->fallback_req) + goto err; + + ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL); + if (!ctx->completions) + goto err; + + /* + * Use 5 bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread. + */ + hash_bits = ilog2(p->cq_entries); + hash_bits -= 5; + if (hash_bits <= 0) + hash_bits = 1; + ctx->cancel_hash_bits = hash_bits; + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), + GFP_KERNEL); + if (!ctx->cancel_hash) + goto err; + __hash_init(ctx->cancel_hash, 1U << hash_bits); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - kfree(ctx); - return NULL; - } + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + goto err; ctx->flags = p->flags; init_waitqueue_head(&ctx->cq_wait); - init_completion(&ctx->ctx_done); - init_completion(&ctx->sqo_thread_started); + INIT_LIST_HEAD(&ctx->cq_overflow_list); + init_completion(&ctx->completions[0]); + init_completion(&ctx->completions[1]); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); - for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) { - spin_lock_init(&ctx->pending_async[i].lock); - INIT_LIST_HEAD(&ctx->pending_async[i].list); - atomic_set(&ctx->pending_async[i].cnt, 0); - } spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); + init_waitqueue_head(&ctx->inflight_wait); + spin_lock_init(&ctx->inflight_lock); + INIT_LIST_HEAD(&ctx->inflight_list); return ctx; +err: + if (ctx->fallback_req) + kmem_cache_free(req_cachep, ctx->fallback_req); + kfree(ctx->completions); + kfree(ctx->cancel_hash); + kfree(ctx); + return NULL; } -static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool __req_need_defer(struct io_kiocb *req) { - return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; + struct io_ring_ctx *ctx = req->ctx; + + return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped + + atomic_read(&ctx->cached_cq_overflow); } -static inline bool io_sequence_defer(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) - return false; + if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + return __req_need_defer(req); - return __io_sequence_defer(ctx, req); + return false; } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -435,7 +523,7 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = 
list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); - if (req && !io_sequence_defer(ctx, req)) { + if (req && !req_need_defer(req)) { list_del_init(&req->list); return req; } @@ -448,9 +536,13 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); - if (req && !__io_sequence_defer(ctx, req)) { - list_del_init(&req->list); - return req; + if (req) { + if (req->flags & REQ_F_TIMEOUT_NOSEQ) + return NULL; + if (!__req_need_defer(req)) { + list_del_init(&req->list); + return req; + } } return NULL; @@ -471,33 +563,80 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline void io_queue_async_work(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +{ + u8 opcode = READ_ONCE(sqe->opcode); + + return !(opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED); +} + +static inline bool io_prep_async_work(struct io_kiocb *req, + struct io_kiocb **link) { - int rw = 0; + bool do_hashed = false; - if (req->submit.sqe) { - switch (req->submit.sqe->opcode) { + if (req->sqe) { + switch (req->sqe->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - rw = !(req->rw.ki_flags & IOCB_DIRECT); + do_hashed = true; + /* fall-through */ + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_SENDMSG: + case IORING_OP_RECVMSG: + case IORING_OP_ACCEPT: + case IORING_OP_POLL_ADD: + case IORING_OP_CONNECT: + /* + * We know REQ_F_ISREG is not set on some of these + * opcodes, but this enables us to keep the check in + * just one place. + */ + if (!(req->flags & REQ_F_ISREG)) + req->work.flags |= IO_WQ_WORK_UNBOUND; break; } + if (io_sqe_needs_user(req->sqe)) + req->work.flags |= IO_WQ_WORK_NEEDS_USER; } - queue_work(ctx->sqo_wq[rw], &req->work); + *link = io_prep_linked_timeout(req); + return do_hashed; +} + +static inline void io_queue_async_work(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link; + bool do_hashed; + + do_hashed = io_prep_async_work(req, &link); + + trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, + req->flags); + if (!do_hashed) { + io_wq_enqueue(ctx->io_wq, &req->work); + } else { + io_wq_enqueue_hashed(ctx->io_wq, &req->work, + file_inode(req->file)); + } + + if (link) + io_queue_linked_timeout(link); } static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); - list_del(&req->list); - io_cqring_fill_event(req->ctx, req->user_data, 0); - __io_free_req(req); + list_del_init(&req->list); + io_cqring_fill_event(req, 0); + io_put_req(req); } } @@ -521,13 +660,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { - if (req->flags & REQ_F_SHADOW_DRAIN) { - /* Just for drain, free it. 
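io_prep_async_work() above hashes buffered writes on the file's inode, so io-wq runs writes to the same file in order while other work may run concurrently, and work against non-regular files goes to the unbounded worker pool. A hedged user-space sketch of just that dispatch decision; the enqueue stubs stand in for the real io_wq_enqueue()/io_wq_enqueue_hashed() from io-wq.h:

#include <stdbool.h>
#include <stdio.h>

enum { OP_READV, OP_WRITEV };

struct work {
        int opcode;
        unsigned long inode;    /* hash key for serialized work */
};

static void enqueue(struct work *w)
{
        (void)w;
        printf("plain queue, may run concurrently\n");
}

static void enqueue_hashed(struct work *w, void *key)
{
        (void)w;
        printf("hashed queue, serialized on key %p\n", key);
}

/* mirrors the do_hashed decision in io_prep_async_work() */
static bool work_is_hashed(const struct work *w)
{
        return w->opcode == OP_WRITEV;
}

static void queue_async_work(struct work *w)
{
        if (work_is_hashed(w))
                enqueue_hashed(w, (void *)w->inode);    /* per-inode ordering */
        else
                enqueue(w);
}

int main(void)
{
        struct work wr = { OP_WRITEV, 0x1234 };
        struct work rd = { OP_READV, 0x1234 };

        queue_async_work(&wr);  /* writes to one file stay ordered */
        queue_async_work(&rd);  /* reads may run concurrently */
        return 0;
}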
*/ - __io_free_req(req); - continue; - } req->flags |= REQ_F_IO_DRAINED; - io_queue_async_work(ctx, req); + io_queue_async_work(req); } } @@ -549,51 +683,128 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } -static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, - long res) +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + if (waitqueue_active(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); + if (ctx->cq_ev_fd) + eventfd_signal(ctx->cq_ev_fd, 1); +} + +/* Returns true if there are no backlogged entries after the flush */ +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { + struct io_rings *rings = ctx->rings; + struct io_uring_cqe *cqe; + struct io_kiocb *req; + unsigned long flags; + LIST_HEAD(list); + + if (!force) { + if (list_empty_careful(&ctx->cq_overflow_list)) + return true; + if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == + rings->cq_ring_entries)) + return false; + } + + spin_lock_irqsave(&ctx->completion_lock, flags); + + /* if force is set, the ring is going away. always drop after that */ + if (force) + ctx->cq_overflow_flushed = true; + + cqe = NULL; + while (!list_empty(&ctx->cq_overflow_list)) { + cqe = io_get_cqring(ctx); + if (!cqe && !force) + break; + + req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, + list); + list_move(&req->list, &list); + if (cqe) { + WRITE_ONCE(cqe->user_data, req->user_data); + WRITE_ONCE(cqe->res, req->result); + WRITE_ONCE(cqe->flags, 0); + } else { + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); + } + } + + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); + + while (!list_empty(&list)) { + req = list_first_entry(&list, struct io_kiocb, list); + list_del(&req->list); + io_put_req(req); + } + + return cqe != NULL; +} + +static void io_cqring_fill_event(struct io_kiocb *req, long res) +{ + struct io_ring_ctx *ctx = req->ctx; struct io_uring_cqe *cqe; + trace_io_uring_complete(ctx, req->user_data, res); + /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in * the ring. 
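io_cqring_overflow_flush() above parks completions that arrive while the CQ ring is full and moves them into the ring once the application has consumed entries, instead of only bumping the overflow counter as before. A simplified user-space model of that backlog; fixed-size arrays stand in for the shared rings and all names are illustrative:

#include <stdio.h>

#define CQ_ENTRIES 4

struct cqe { unsigned long long user_data; int res; };

static struct cqe ring[CQ_ENTRIES];
static unsigned head, tail;             /* free-running ring indices */

static struct cqe backlog[16];
static unsigned bl_head, bl_tail;

static struct cqe *get_cqe(void)
{
        if (tail - head == CQ_ENTRIES)
                return NULL;            /* ring full */
        return &ring[tail++ & (CQ_ENTRIES - 1)];
}

static void fill_event(unsigned long long user_data, int res)
{
        struct cqe *cqe = get_cqe();

        if (cqe) {
                cqe->user_data = user_data;
                cqe->res = res;
        } else {
                /* park the completion until the ring has room again */
                backlog[bl_tail++ & 15] = (struct cqe){ user_data, res };
        }
}

static void overflow_flush(void)
{
        struct cqe *cqe;

        while (bl_head != bl_tail && (cqe = get_cqe()) != NULL)
                *cqe = backlog[bl_head++ & 15];
}

int main(void)
{
        for (int i = 0; i < 6; i++)
                fill_event(i, 0);       /* last two overflow into the backlog */
        head += 2;                      /* application consumes two CQEs */
        overflow_flush();
        printf("tail=%u backlog=%u\n", tail, bl_tail - bl_head);
        return 0;
}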
*/ cqe = io_get_cqring(ctx); - if (cqe) { - WRITE_ONCE(cqe->user_data, ki_user_data); + if (likely(cqe)) { + WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); + } else if (ctx->cq_overflow_flushed) { + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); } else { - unsigned overflow = READ_ONCE(ctx->rings->cq_overflow); - - WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1); + refcount_inc(&req->refs); + req->result = res; + list_add_tail(&req->list, &ctx->cq_overflow_list); } } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) -{ - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); - if (waitqueue_active(&ctx->sqo_wait)) - wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) - eventfd_signal(ctx->cq_ev_fd, 1); -} - -static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, - long res) +static void io_cqring_add_event(struct io_kiocb *req, long res) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - io_cqring_fill_event(ctx, user_data, res); + io_cqring_fill_event(req, res); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } +static inline bool io_is_fallback_req(struct io_kiocb *req) +{ + return req == (struct io_kiocb *) + ((unsigned long) req->ctx->fallback_req & ~1UL); +} + +static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + req = ctx->fallback_req; + if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req)) + return req; + + return NULL; +} + static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { @@ -606,7 +817,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) - goto out; + goto fallback; } else if (!state->free_reqs) { size_t sz; int ret; @@ -621,7 +832,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, if (unlikely(ret <= 0)) { state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); if (!state->reqs[0]) - goto out; + goto fallback; ret = 1; } state->free_reqs = ret - 1; @@ -633,14 +844,21 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, state->cur_req++; } +got_it: + req->io = NULL; + req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->result = 0; + INIT_IO_WORK(&req->work, io_wq_submit_work); return req; -out: +fallback: + req = io_get_fallback_req(ctx); + if (req) + goto got_it; percpu_ref_put(&ctx->refs); return NULL; } @@ -656,34 +874,81 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) static void __io_free_req(struct io_kiocb *req) { + struct io_ring_ctx *ctx = req->ctx; + + if (req->io) + kfree(req->io); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); - percpu_ref_put(&req->ctx->refs); - kmem_cache_free(req_cachep, req); + if (req->flags & REQ_F_INFLIGHT) { + unsigned long flags; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + } + percpu_ref_put(&ctx->refs); + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); } -static void 
io_req_link_next(struct io_kiocb *req) +static bool io_link_cancel_timeout(struct io_kiocb *req) { - struct io_kiocb *nxt; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + if (ret != -1) { + io_cqring_fill_event(req, -ECANCELED); + io_commit_cqring(ctx); + req->flags &= ~REQ_F_LINK; + io_put_req(req); + return true; + } + + return false; +} + +static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +{ + struct io_ring_ctx *ctx = req->ctx; + bool wake_ev = false; + + /* Already got next link */ + if (req->flags & REQ_F_LINK_NEXT) + return; /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. */ - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - if (nxt) { - list_del(&nxt->list); - if (!list_empty(&req->link_list)) { - INIT_LIST_HEAD(&nxt->link_list); - list_splice(&req->link_list, &nxt->link_list); - nxt->flags |= REQ_F_LINK; + while (!list_empty(&req->link_list)) { + struct io_kiocb *nxt = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT))) { + list_del_init(&nxt->link_list); + wake_ev |= io_link_cancel_timeout(nxt); + req->flags &= ~REQ_F_LINK_TIMEOUT; + continue; } - nxt->flags |= REQ_F_LINK_DONE; - INIT_WORK(&nxt->work, io_sq_wq_submit_work); - io_queue_async_work(req->ctx, nxt); + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK; + *nxtptr = nxt; + break; } + + req->flags |= REQ_F_LINK_NEXT; + if (wake_ev) + io_cqring_ev_posted(ctx); } /* @@ -691,33 +956,86 @@ static void io_req_link_next(struct io_kiocb *req) */ static void io_fail_links(struct io_kiocb *req) { - struct io_kiocb *link; + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del(&link->list); + struct io_kiocb *link = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + + list_del_init(&link->link_list); + trace_io_uring_fail_link(req, link); - io_cqring_add_event(req->ctx, link->user_data, -ECANCELED); - __io_free_req(link); + if ((req->flags & REQ_F_LINK_TIMEOUT) && + link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { + io_link_cancel_timeout(link); + } else { + io_cqring_fill_event(link, -ECANCELED); + __io_double_put_req(link); + } + req->flags &= ~REQ_F_LINK_TIMEOUT; } + + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); } -static void io_free_req(struct io_kiocb *req) +static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { + if (likely(!(req->flags & REQ_F_LINK))) + return; + /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_LINK) { - if (req->flags & REQ_F_FAIL_LINK) - io_fail_links(req); - else - io_req_link_next(req); + if (req->flags & REQ_F_FAIL_LINK) { + io_fail_links(req); + } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == + REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + /* + * If this is a timeout link, we could be racing with the + * timeout timer. 
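io_req_link_next() above pops the next entry off the completed request's link_list and queues it, and io_fail_links() cancels the remainder of a chain once one member fails. A user-space analogue of that chain advancement with a plain list (names illustrative):

#include <stdio.h>

struct lreq {
        const char *name;
        struct lreq *next;      /* stand-in for link_list */
        int failed;
};

/* complete the head of a chain; on failure cancel the rest, as io_fail_links() does */
static struct lreq *complete(struct lreq *req)
{
        struct lreq *nxt = req->next;

        if (req->failed) {
                while (nxt) {
                        printf("%s: -ECANCELED\n", nxt->name);
                        nxt = nxt->next;
                }
                return NULL;
        }
        return nxt;             /* this one gets queued next */
}

int main(void)
{
        struct lreq c = { "fsync", NULL, 0 };
        struct lreq b = { "write", &c, 0 };
        struct lreq a = { "read", &b, 0 };

        for (struct lreq *req = &a; req; req = complete(req))
                printf("%s: complete\n", req->name);
        return 0;
}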
Grab the completion lock for this case to + * protect against that. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); + io_req_link_next(req, nxt); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + io_req_link_next(req, nxt); } +} + +static void io_free_req(struct io_kiocb *req) +{ + struct io_kiocb *nxt = NULL; + io_req_find_next(req, &nxt); __io_free_req(req); + + if (nxt) + io_queue_async_work(nxt); +} + +/* + * Drop reference to request, return next in chain (if there is one) if this + * was the last reference to this request. + */ +__attribute__((nonnull)) +static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) +{ + io_req_find_next(req, nxtptr); + + if (refcount_dec_and_test(&req->refs)) + __io_free_req(req); } static void io_put_req(struct io_kiocb *req) @@ -726,13 +1044,51 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static unsigned io_cqring_events(struct io_rings *rings) +/* + * Must only be used if we don't need to care about links, usually from + * within the completion handling itself. + */ +static void __io_double_put_req(struct io_kiocb *req) +{ + /* drop both submit and complete references */ + if (refcount_sub_and_test(2, &req->refs)) + __io_free_req(req); +} + +static void io_double_put_req(struct io_kiocb *req) { + /* drop both submit and complete references */ + if (refcount_sub_and_test(2, &req->refs)) + io_free_req(req); +} + +static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) +{ + struct io_rings *rings = ctx->rings; + + /* + * noflush == true is from the waitqueue handler, just ensure we wake + * up the task, and the next invocation will flush the entries. We + * cannot safely do it from here. + */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; + + io_cqring_overflow_flush(ctx, false); + /* See comment at the top of this file */ smp_rmb(); return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); } +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + + /* make sure SQ entry isn't read before tail */ + return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; +} + /* * Find and free completed poll iocbs */ @@ -748,7 +1104,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); - io_cqring_fill_event(ctx, req->user_data, req->result); + io_cqring_fill_event(req, req->result); (*nr_events)++; if (refcount_dec_and_test(&req->refs)) { @@ -757,8 +1113,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands.
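The request lifetime above relies on two references per request, one dropped after submission and one at completion; io_double_put_req() drops both at once with refcount_sub_and_test(2, ...). A sketch of the same pattern, with C11 atomics standing in for the kernel's refcount_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
        atomic_int refs;
};

static void req_free(struct req *r)
{
        printf("freeing request\n");
        free(r);
}

static void put_req(struct req *r)
{
        /* refcount_dec_and_test() analogue: free on the last reference */
        if (atomic_fetch_sub(&r->refs, 1) == 1)
                req_free(r);
}

static void double_put_req(struct req *r)
{
        /* refcount_sub_and_test(2, ...) analogue */
        if (atomic_fetch_sub(&r->refs, 2) == 2)
                req_free(r);
}

int main(void)
{
        struct req *r = malloc(sizeof(*r));

        atomic_init(&r->refs, 2);       /* submit + complete references */
        put_req(r);                     /* dropped after submission */
        put_req(r);                     /* dropped at completion: frees */

        r = malloc(sizeof(*r));
        atomic_init(&r->refs, 2);
        double_put_req(r);              /* completion path drops both: frees */
        return 0;
}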
*/ - if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) { + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && + !req->io) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -862,19 +1219,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) { - int iters, ret = 0; - - /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); + int iters = 0, ret = 0; - iters = 0; do { int tmin = 0; @@ -883,7 +1232,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (io_cqring_events(ctx->rings)) + if (io_cqring_events(ctx, false)) break; /* @@ -910,42 +1259,76 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); + return ret; +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret; + + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + ret = __io_iopoll_check(ctx, nr_events, min); mutex_unlock(&ctx->uring_lock); return ret; } -static void kiocb_end_write(struct kiocb *kiocb) +static void kiocb_end_write(struct io_kiocb *req) { - if (kiocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(kiocb->ki_filp); + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct inode *inode = file_inode(req->file); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. 
- */ - if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); } + file_end_write(req->file); } -static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, req->user_data, res); + io_cqring_add_event(req, res); +} + +static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + io_complete_rw_common(kiocb, res); io_put_req(req); } +static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *nxt = NULL; + + io_complete_rw_common(kiocb, res); + io_put_req_find_next(req, &nxt); + + return nxt; +} + static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -1047,10 +1430,9 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, - bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) { - const struct io_uring_sqe *sqe = s->sqe; + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio; @@ -1059,8 +1441,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (!req->file) return -EBADF; - if (force_nonblock && !io_file_supports_async(req->file)) - force_nonblock = false; + if (S_ISREG(file_inode(req->file)->i_mode)) + req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); @@ -1081,7 +1463,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, return ret; /* don't allow async punt if RWF_NOWAIT was requested */ - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) || + (req->file->f_flags & O_NONBLOCK)) req->flags |= REQ_F_NOWAIT; if (force_nonblock) @@ -1094,6 +1477,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; + req->result = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -1123,9 +1507,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } -static int io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, - struct iov_iter *iter) +static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, + bool in_async) +{ + if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) + *nxt = __io_complete_rw(kiocb, ret); + else + io_rw_done(kiocb, ret); +} + +static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, + const struct io_uring_sqe *sqe, + struct iov_iter *iter) { size_t len = READ_ONCE(sqe->len); struct io_mapped_ubuf *imu; @@ -1194,14 +1587,13 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int 
rw, } } - return 0; + return len; } -static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, - const struct sqe_submit *s, struct iovec **iovec, - struct iov_iter *iter) +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = s->sqe; + const struct io_uring_sqe *sqe = req->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); size_t sqe_len = READ_ONCE(sqe->len); u8 opcode; @@ -1215,18 +1607,26 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, * flag. */ opcode = READ_ONCE(sqe->opcode); - if (opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED) { - ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return ret; + return io_import_fixed(req->ctx, rw, sqe, iter); } - if (!s->has_user) + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = iorw->iov; + iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); + if (iorw->iov == iorw->fast_iov) + *iovec = NULL; + return iorw->size; + } + + if (!req->has_user) return -EFAULT; #ifdef CONFIG_COMPAT - if (ctx->compat) + if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); #endif @@ -1234,65 +1634,6 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); } -static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb) -{ - if (al->file == kiocb->ki_filp) { - off_t start, end; - - /* - * Allow merging if we're anywhere in the range of the same - * page. Generally this happens for sub-page reads or writes, - * and it's beneficial to allow the first worker to bring the - * page in and the piggy backed work can then work on the - * cached page. - */ - start = al->io_start & PAGE_MASK; - end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK; - if (kiocb->ki_pos >= start && kiocb->ki_pos <= end) - return true; - } - - al->file = NULL; - return false; -} - -/* - * Make a note of the last file/offset/direction we punted to async - * context. We'll use this information to see if we can piggy back a - * sequential request onto the previous one, if it's still hasn't been - * completed by the async worker. - */ -static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) -{ - struct async_list *async_list = &req->ctx->pending_async[rw]; - struct kiocb *kiocb = &req->rw; - struct file *filp = kiocb->ki_filp; - - if (io_should_merge(async_list, kiocb)) { - unsigned long max_bytes; - - /* Use 8x RA size as a decent limiter for both reads/writes */ - max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3); - if (!max_bytes) - max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3); - - /* If max len are exceeded, reset the state */ - if (async_list->io_len + len <= max_bytes) { - req->flags |= REQ_F_SEQ_PREV; - async_list->io_len += len; - } else { - async_list->file = NULL; - } - } - - /* New file? Reset state. */ - if (async_list->file != filp) { - async_list->io_start = kiocb->ki_pos; - async_list->io_len = len; - async_list->file = filp; - } -} - /* * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. 
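The hunk just below extends loop_rw_iter(), which emulates ->read_iter()/->write_iter() by looping over plain ->read()/->write(), so it can also handle fixed buffers; those arrive as a bio_vec and must be kmap()'ed before the byte-oriented file methods can touch them. A user-space analogue of the basic loop (readv emulated with repeated read(2) calls; no kmap is needed outside the kernel, and the file path is arbitrary):

#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>
#include <fcntl.h>

static ssize_t loop_read(int fd, const struct iovec *iov, int iovcnt)
{
        ssize_t total = 0;

        for (int i = 0; i < iovcnt; i++) {
                ssize_t nr = read(fd, iov[i].iov_base, iov[i].iov_len);

                if (nr < 0)
                        return total ? total : nr;
                total += nr;
                if ((size_t)nr < iov[i].iov_len)
                        break;          /* short read: stop early */
        }
        return total;
}

int main(void)
{
        char a[8], b[8];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        int fd = open("/etc/hostname", O_RDONLY);   /* any readable file */

        if (fd < 0)
                return 1;
        printf("read %zd bytes\n", loop_read(fd, iov, 2));
        close(fd);
        return 0;
}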
@@ -1313,9 +1654,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return -EAGAIN; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); + struct iovec iovec; ssize_t nr; + if (!iov_iter_is_bvec(iter)) { + iovec = iov_iter_iovec(iter); + } else { + /* fixed buffers import bvec */ + iovec.iov_base = kmap(iter->bvec->bv_page) + + iter->iov_offset; + iovec.iov_len = min(iter->count, + iter->bvec->bv_len - iter->iov_offset); + } + if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, iovec.iov_len, &kiocb->ki_pos); @@ -1324,6 +1675,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, iovec.iov_len, &kiocb->ki_pos); } + if (iov_iter_is_bvec(iter)) + kunmap(iter->bvec->bv_page); + if (nr < 0) { if (!ret) ret = nr; @@ -1338,7 +1692,51 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static int io_read(struct io_kiocb *req, const struct sqe_submit *s, +static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io->rw.nr_segs = iter->nr_segs; + req->io->rw.size = io_size; + req->io->rw.iov = iovec; + if (!req->io->rw.iov) { + req->io->rw.iov = req->io->rw.fast_iov; + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); + } +} + +static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + if (req->io) { + io_req_map_io(req, io_size, iovec, fast_iov, iter); + memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); + req->sqe = &req->io->sqe; + return 0; + } + + return -ENOMEM; +} + +static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + + return io_import_iovec(READ, req, iovec, iter); +} + +static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; @@ -1346,23 +1744,31 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t read_size, ret; + ssize_t io_size, ret; - ret = io_prep_rw(req, s, force_nonblock); - if (ret) - return ret; - file = kiocb->ki_filp; - - if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; - - ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); - if (ret < 0) - return ret; + if (!req->io) { + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; + } - read_size = ret; + file = req->file; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = read_size; + req->result = io_size; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -1382,26 +1788,43 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, * need async punt anyway, so it's more efficient to do it * here. 
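io_setup_async_io() above copies the on-stack iovec array into heap-allocated per-request state before the request is punted with -EAGAIN, since the submitter's stack is gone by the time a worker retries. A rough user-space sketch of that copy, assuming an inline fast-path array like the patch's fast_iov; all names here are illustrative:

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#define FAST_IOV 8

struct async_rw {
        struct iovec fast_iov[FAST_IOV];
        struct iovec *iov;
        int nr_segs;
};

/* returns NULL on allocation failure, mirroring the -ENOMEM path */
static struct async_rw *setup_async(const struct iovec *iov, int nr_segs)
{
        struct async_rw *io = malloc(sizeof(*io));

        if (!io)
                return NULL;
        io->nr_segs = nr_segs;
        if (nr_segs <= FAST_IOV) {
                /* small vectors fit in the inline array */
                memcpy(io->fast_iov, iov, nr_segs * sizeof(*iov));
                io->iov = io->fast_iov;
        } else {
                io->iov = malloc(nr_segs * sizeof(*iov));
                if (!io->iov) {
                        free(io);
                        return NULL;
                }
                memcpy(io->iov, iov, nr_segs * sizeof(*iov));
        }
        return io;
}

int main(void)
{
        char buf[16];
        struct iovec iov = { buf, sizeof(buf) };
        struct async_rw *io = setup_async(&iov, 1);

        free(io);       /* io->iov points into io itself in this case */
        return 0;
}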
*/ - if (force_nonblock && ret2 > 0 && ret2 < read_size) + if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && + (req->flags & REQ_F_ISREG) && + ret2 > 0 && ret2 < io_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { - io_rw_done(kiocb, ret2); + kiocb_done(kiocb, ret2, nxt, req->in_async); } else { - /* - * If ->needs_lock is true, we're already in async - * context. - */ - if (!s->needs_lock) - io_async_list_note(READ, req, iov_count); - ret = -EAGAIN; +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; } } +out_free: kfree(iovec); return ret; } -static int io_write(struct io_kiocb *req, const struct sqe_submit *s, +static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + + return io_import_iovec(WRITE, req, iovec, iter); +} + +static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; @@ -1409,33 +1832,36 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; + ssize_t ret, io_size; - ret = io_prep_rw(req, s, force_nonblock); - if (ret) - return ret; + if (!req->io) { + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; + } file = kiocb->ki_filp; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; - - ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); - if (ret < 0) - return ret; - + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = ret; - - iov_count = iov_iter_count(&iter); + req->result = io_size; - ret = -EAGAIN; - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) { - /* If ->needs_lock is true, we're already in async context. */ - if (!s->needs_lock) - io_async_list_note(WRITE, req, iov_count); - goto out_free; + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; } + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) + goto copy_iov; + + iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1447,7 +1873,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, * released so that it doesn't complain about the held lock when * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) { + if (req->flags & REQ_F_ISREG) { __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, @@ -1460,15 +1886,14 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { - io_rw_done(kiocb, ret2); + kiocb_done(kiocb, ret2, nxt, req->in_async); } else { - /* - * If ->needs_lock is true, we're already in async - * context. 
- */ - if (!s->needs_lock) - io_async_list_note(WRITE, req, iov_count); - ret = -EAGAIN; +copy_iov: + ret = io_setup_async_io(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; } } out_free: @@ -1479,15 +1904,14 @@ out_free: /* * IORING_OP_NOP just posts a completion event, nothing else. */ -static int io_nop(struct io_kiocb *req, u64 user_data) +static int io_nop(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - long err = 0; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(ctx, user_data, err); + io_cqring_add_event(req, 0); io_put_req(req); return 0; } @@ -1508,7 +1932,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) } static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + struct io_kiocb **nxt, bool force_nonblock) { loff_t sqe_off = READ_ONCE(sqe->off); loff_t sqe_len = READ_ONCE(sqe->len); @@ -1534,8 +1958,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, sqe->user_data, ret); - io_put_req(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); return 0; } @@ -1557,6 +1981,7 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sync_file_range(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { loff_t sqe_off; @@ -1580,17 +2005,109 @@ static int io_sync_file_range(struct io_kiocb *req, if (ret < 0 && (req->flags & REQ_F_LINK)) req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req->ctx, sqe->user_data, ret); - io_put_req(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); +#else + return 0; +#endif +} + +static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_sendmsg_prep(req, &io); + if (ret) + goto out; + } + + ret = __sys_sendmsg_sock(sock, kmsg, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(&copy->msg, &io.msg, sizeof(copy->msg)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + +out: + io_cqring_add_event(req, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) +
req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, nxt); return 0; +#else + return -EOPNOTSUPP; +#endif } +static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ #if defined(CONFIG_NET) -static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock, - long (*fn)(struct socket *, struct user_msghdr __user *, - unsigned int)) + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, + &io->msg.iov); +#else + return 0; +#endif +} + +static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { +#if defined(CONFIG_NET) struct socket *sock; int ret; @@ -1600,6 +2117,9 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { struct user_msghdr __user *msg; + struct io_async_ctx io, *copy; + struct sockaddr_storage addr; + struct msghdr *kmsg; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -1610,33 +2130,144 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, msg = (struct user_msghdr __user *) (unsigned long) READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg.msg; + kmsg->msg_name = &addr; + } else { + kmsg = &io.msg.msg; + kmsg->msg_name = &addr; + io.msg.iov = io.msg.fast_iov; + ret = io_recvmsg_prep(req, &io); + if (ret) + goto out; + } - ret = fn(sock, msg, flags); - if (force_nonblock && ret == -EAGAIN) + ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + copy = kmalloc(sizeof(*copy), GFP_KERNEL); + if (!copy) { + ret = -ENOMEM; + goto out; + } + memcpy(copy, &io, sizeof(*copy)); + req->io = copy; + memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &req->io->sqe; return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; } - io_cqring_add_event(req->ctx, sqe->user_data, ret); - io_put_req(req); +out: + io_cqring_add_event(req, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, nxt); return 0; -} +#else + return -EOPNOTSUPP; #endif +} -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock); + struct sockaddr __user *addr; + int __user *addr_len; + unsigned file_flags; + int flags, ret; + + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index) + return -EINVAL; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); + flags = READ_ONCE(sqe->accept_flags); + file_flags = force_nonblock ? 
O_NONBLOCK : 0; + + ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); + if (ret == -EAGAIN && force_nonblock) { + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; #else return -EOPNOTSUPP; #endif } -static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) +static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct sockaddr __user *addr; + int addr_len; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(addr, addr_len, &io->connect.address); +#else + return 0; +#endif +} + +static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock); + struct io_async_ctx __io, *io; + unsigned file_flags; + int addr_len, ret; + + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + addr_len = READ_ONCE(sqe->addr2); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + if (req->io) { + io = req->io; + } else { + ret = io_connect_prep(req, &__io); + if (ret) + goto out; + io = &__io; + } + + ret = __sys_connect_file(req->file, &io->connect.address, addr_len, + file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { + ret = -ENOMEM; + goto out; + } + memcpy(&io->connect, &__io.connect, sizeof(io->connect)); + req->io = io; + memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); + req->sqe = &io->sqe; + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; +out: + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; #else return -EOPNOTSUPP; #endif @@ -1648,27 +2279,47 @@ static void io_poll_remove_one(struct io_kiocb *req) spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); - io_queue_async_work(req->ctx, req); + if (!list_empty(&poll->wait->entry)) { + list_del_init(&poll->wait->entry); + io_queue_async_work(req); } spin_unlock(&poll->head->lock); - - list_del_init(&req->list); + hash_del(&req->hash_node); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { + struct hlist_node *tmp; struct io_kiocb *req; + int i; spin_lock_irq(&ctx->completion_lock); - while (!list_empty(&ctx->cancel_list)) { - req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list); - io_poll_remove_one(req); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) + io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); } +static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) +{ + struct hlist_head *list; + struct io_kiocb *req; + + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (sqe_addr == req->user_data) { + 
io_poll_remove_one(req); + return 0; + } + } + + return -ENOENT; +} + /* * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. @@ -1676,8 +2327,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx) static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *poll_req, *next; - int ret = -ENOENT; + int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -1686,37 +2336,48 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) { - if (READ_ONCE(sqe->addr) == poll_req->user_data) { - io_poll_remove_one(poll_req); - ret = 0; - break; - } - } + ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req->ctx, sqe->user_data, ret); + io_cqring_add_event(req, ret); + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req); return 0; } -static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, - __poll_t mask) +static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { + struct io_ring_ctx *ctx = req->ctx; + req->poll.done = true; - io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask)); + kfree(req->poll.wait); + if (error) + io_cqring_fill_event(req, error); + else + io_cqring_fill_event(req, mangle_poll(mask)); io_commit_cqring(ctx); } -static void io_poll_complete_work(struct work_struct *work) +static void io_poll_complete_work(struct io_wq_work **workptr) { + struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_poll_iocb *poll = &req->poll; struct poll_table_struct pt = { ._key = poll->events }; struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *nxt = NULL; __poll_t mask = 0; + int ret = 0; - if (!READ_ONCE(poll->canceled)) + if (work->flags & IO_WQ_WORK_CANCEL) { + WRITE_ONCE(poll->canceled, true); + ret = -ECANCELED; + } else if (READ_ONCE(poll->canceled)) { + ret = -ECANCELED; + } + + if (ret != -ECANCELED) mask = vfs_poll(poll->file, &pt) & poll->events; /* @@ -1727,24 +2388,28 @@ static void io_poll_complete_work(struct work_struct *work) * avoid further branches in the fast path. 
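The flat cancel_list is replaced above by a hash table keyed on user_data, so io_poll_cancel() scans one bucket instead of every pending poll request. A user-space analogue with singly linked buckets and a trivial multiplicative hash standing in for the kernel's hash_long(); names are illustrative:

#include <stdio.h>

#define HASH_BITS 4
#define HASH_BUCKETS (1U << HASH_BITS)

struct preq {
        unsigned long long user_data;
        struct preq *next;
};

static struct preq *cancel_hash[HASH_BUCKETS];

static unsigned bucket(unsigned long long key)
{
        /* stand-in for the kernel's hash_long() */
        return (unsigned)(key * 0x9E3779B97F4A7C15ULL >> (64 - HASH_BITS));
}

static void insert(struct preq *req)
{
        unsigned b = bucket(req->user_data);

        req->next = cancel_hash[b];
        cancel_hash[b] = req;
}

static struct preq *cancel(unsigned long long user_data)
{
        struct preq **p = &cancel_hash[bucket(user_data)];

        for (; *p; p = &(*p)->next) {
                if ((*p)->user_data == user_data) {
                        struct preq *found = *p;

                        *p = found->next;       /* unlink from its bucket */
                        return found;
                }
        }
        return NULL;            /* -ENOENT in the kernel */
}

int main(void)
{
        struct preq a = { 0xdead, NULL }, b = { 0xbeef, NULL };

        insert(&a);
        insert(&b);
        printf("%s\n", cancel(0xbeef) ? "canceled" : "not found");
        printf("%s\n", cancel(0xbeef) ? "canceled" : "not found");
        return 0;
}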
*/ spin_lock_irq(&ctx->completion_lock); - if (!mask && !READ_ONCE(poll->canceled)) { - add_wait_queue(poll->head, &poll->wait); + if (!mask && ret != -ECANCELED) { + add_wait_queue(poll->head, poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } - list_del_init(&req->list); - io_poll_complete(ctx, req, mask); + hash_del(&req->hash_node); + io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_put_req(req); + + if (ret < 0 && req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, &nxt); + if (nxt) + *workptr = &nxt->work; } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); + struct io_poll_iocb *poll = wait->private; struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); @@ -1754,17 +2419,24 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); + /* + * Run completion inline if we can. We're using trylock here because + * we are violating the completion_lock -> poll wq lock ordering. + * If we have a link timeout we're going to need the completion_lock + * for finalizing the request, mark us as having grabbed that already. + */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - list_del(&req->list); - io_poll_complete(ctx, req, mask); + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - io_put_req(req); } else { - io_queue_async_work(ctx, req); + io_queue_async_work(req); } return 1; @@ -1788,10 +2460,20 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); + add_wait_queue(head, pt->req->poll.wait); +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); } -static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt) { struct io_poll_iocb *poll = &req->poll; struct io_ring_ctx *ctx = req->ctx; @@ -1807,10 +2489,15 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!poll->file) return -EBADF; - req->submit.sqe = NULL; - INIT_WORK(&req->work, io_poll_complete_work); + poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); + if (!poll->wait) + return -ENOMEM; + + req->io = NULL; + INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; poll->done = false; @@ -1822,8 +2509,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); + 
INIT_LIST_HEAD(&poll->wait->entry); + init_waitqueue_func_entry(poll->wait, io_poll_wake); + poll->wait->private = poll; INIT_LIST_HEAD(&req->list); @@ -1832,187 +2520,464 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { + if (unlikely(list_empty(&poll->wait->entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ - list_add_tail(&req->list, &ctx->cancel_list); + io_poll_req_insert(req); spin_unlock(&poll->head->lock); } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(ctx, req, mask); + io_poll_complete(req, mask, 0); } spin_unlock_irq(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); - io_put_req(req); + io_put_req_find_next(req, nxt); } return ipt.error; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { - struct io_ring_ctx *ctx; - struct io_kiocb *req; + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - req = container_of(timer, struct io_kiocb, timeout.timer); - ctx = req->ctx; atomic_inc(&ctx->cq_timeouts); spin_lock_irqsave(&ctx->completion_lock, flags); - list_del(&req->list); + /* + * We could be racing with timeout deletion. If the list is empty, + * then timeout lookup already found it and will be handling it. + */ + if (!list_empty(&req->list)) { + struct io_kiocb *prev; + + /* + * Adjust the reqs sequence before the current one because it + * will consume a slot in the cq_ring and the cq_tail + * pointer will be increased, otherwise other timeout reqs may + * return in advance without waiting for enough wait_nr.
+ */ + prev = req; + list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) + prev->sequence++; + list_del_init(&req->list); + } - io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req(req); return HRTIMER_NORESTART; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +{ + struct io_kiocb *req; + int ret = -ENOENT; + + list_for_each_entry(req, &ctx->timeout_list, list) { + if (user_data == req->user_data) { + list_del_init(&req->list); + ret = 0; + break; + } + } + + if (ret == -ENOENT) + return ret; + + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + if (ret == -1) + return -EALREADY; + + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_fill_event(req, -ECANCELED); + io_put_req(req); + return 0; +} + +/* + * Remove or update an existing timeout command + */ +static int io_timeout_remove(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - unsigned count, req_dist, tail_index; struct io_ring_ctx *ctx = req->ctx; - struct list_head *entry; - struct timespec64 ts; + unsigned flags; + int ret; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || - sqe->len != 1) + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + flags = READ_ONCE(sqe->timeout_flags); + if (flags) + return -EINVAL; + + spin_lock_irq(&ctx->completion_lock); + ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); + + io_cqring_fill_event(req, ret); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + if (ret < 0 && req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; + io_put_req(req); + return 0; +} + +static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, + bool is_timeout_link) +{ + const struct io_uring_sqe *sqe = req->sqe; + struct io_timeout_data *data; + unsigned flags; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->buf_index || sqe->len != 1) + return -EINVAL; + if (sqe->off && is_timeout_link) + return -EINVAL; + flags = READ_ONCE(sqe->timeout_flags); + if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) + data = &io->timeout; + data->req = req; + req->flags |= REQ_F_TIMEOUT; + + if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; + if (flags & IORING_TIMEOUT_ABS) + data->mode = HRTIMER_MODE_ABS; + else + data->mode = HRTIMER_MODE_REL; + + hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + req->io = io; + return 0; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count; + struct io_ring_ctx *ctx = req->ctx; + struct io_timeout_data *data; + struct io_async_ctx *io; + struct list_head *entry; + unsigned span = 0; + + io = req->io; + if (!io) { + int ret; + + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) + return -ENOMEM; + ret = io_timeout_prep(req, io, false); + if (ret) { + kfree(io); + return ret; + } + } + data = &req->io->timeout; + /* * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. + * timeout event to be satisfied. 
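The insertion sort below compares sequence targets in 64 bits and adds UINT_MAX when the 32-bit cached_sq_head has wrapped relative to an existing entry; without that correction, a timeout queued just after the wrap would compute a tiny target and sort before older entries. A worked example of the arithmetic:

#include <limits.h>
#include <stdio.h>

int main(void)
{
        /* head wrapped past UINT_MAX; an older timeout was queued before the wrap */
        unsigned int cached_sq_head = 3;
        unsigned int count = 10;                 /* new timeout fires after 10 sqes */
        unsigned int nxt_sq_head = UINT_MAX - 5; /* head when old timeout was queued */
        unsigned int nxt_offset = 3;             /* old timeout fires after 3 sqes */

        long long tmp = (long long)cached_sq_head + count - 1;       /* 12 */
        long long tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1; /* ~2^32 - 4 */

        /* without this correction, 12 < tmp_nxt would missort the new timeout */
        if (cached_sq_head < nxt_sq_head)
                tmp += UINT_MAX;

        printf("new timeout fires %s the existing one\n",
               tmp > tmp_nxt ? "after" : "before");
        return 0;
}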
If it isn't set, then this is + * a pure timeout request and sequence isn't used. */ count = READ_ONCE(sqe->off); - if (!count) - count = 1; + if (!count) { + req->flags |= REQ_F_TIMEOUT_NOSEQ; + spin_lock_irq(&ctx->completion_lock); + entry = ctx->timeout_list.prev; + goto add; + } req->sequence = ctx->cached_sq_head + count - 1; - req->flags |= REQ_F_TIMEOUT; + data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. */ - tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped; - req_dist = req->sequence - tail_index; spin_lock_irq(&ctx->completion_lock); list_for_each_prev(entry, &ctx->timeout_list) { struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - unsigned dist; + unsigned nxt_sq_head; + long long tmp, tmp_nxt; + u32 nxt_offset = nxt->io->timeout.seq_offset; + + if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + continue; + + /* + * Since cached_sq_head + count - 1 can overflow, use type long + * long to store it. + */ + tmp = (long long)ctx->cached_sq_head + count - 1; + nxt_sq_head = nxt->sequence - nxt_offset + 1; + tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1; + + /* + * cached_sq_head may overflow, and it will never overflow twice + * while a valid timeout req is still pending. + */ + if (ctx->cached_sq_head < nxt_sq_head) + tmp += UINT_MAX; - dist = nxt->sequence - tail_index; - if (req_dist >= dist) + if (tmp > tmp_nxt) break; + + /* + * The sequence of the inserted req and of every req after it + * should be adjusted because each timeout req consumes a slot. + */ + span++; + nxt->sequence++; } + req->sequence -= span; +add: list_add(&req->list, entry); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); + return 0; +} + +static bool io_cancel_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); - hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), - HRTIMER_MODE_REL); + return req->user_data == (unsigned long) data; +} + +static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) +{ + enum io_wq_cancel cancel_ret; + int ret = 0; + + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); + switch (cancel_ret) { + case IO_WQ_CANCEL_OK: + ret = 0; + break; + case IO_WQ_CANCEL_RUNNING: + ret = -EALREADY; + break; + case IO_WQ_CANCEL_NOTFOUND: + ret = -ENOENT; + break; + } + + return ret; +} + +static void io_async_find_and_cancel(struct io_ring_ctx *ctx, + struct io_kiocb *req, __u64 sqe_addr, + struct io_kiocb **nxt, int success_ret) +{ + unsigned long flags; + int ret; + + ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr); + if (ret != -ENOENT) { + spin_lock_irqsave(&ctx->completion_lock, flags); + goto done; + } + + spin_lock_irqsave(&ctx->completion_lock, flags); + ret = io_timeout_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + ret = io_poll_cancel(ctx, sqe_addr); +done: + if (!ret) + ret = success_ret; + io_cqring_fill_event(req, ret); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + io_cqring_ev_posted(ctx); + + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_put_req_find_next(req, nxt); +} + +static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt) +{ + struct
io_ring_ctx *ctx = req->ctx; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || + sqe->cancel_flags) + return -EINVAL; + + io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); + return 0; +} + +static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct iov_iter iter; + ssize_t ret; + + memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); + req->sqe = &io->sqe; + + switch (io->sqe.opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + ret = io_read_prep(req, &iovec, &iter, true); + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + ret = io_write_prep(req, &iovec, &iter, true); + break; + case IORING_OP_SENDMSG: + ret = io_sendmsg_prep(req, io); + break; + case IORING_OP_RECVMSG: + ret = io_recvmsg_prep(req, io); + break; + case IORING_OP_CONNECT: + ret = io_connect_prep(req, io); + break; + case IORING_OP_TIMEOUT: + return io_timeout_prep(req, io, false); + case IORING_OP_LINK_TIMEOUT: + return io_timeout_prep(req, io, true); + default: + req->io = io; + return 0; + } + + if (ret < 0) + return ret; + + req->io = io; + io_req_map_io(req, ret, iovec, inline_vecs, &iter); return 0; } -static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_defer(struct io_kiocb *req) { - struct io_uring_sqe *sqe_copy; + struct io_ring_ctx *ctx = req->ctx; + struct io_async_ctx *io; + int ret; - if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) + /* Still need defer if there is pending req in defer list. */ + if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) return -EAGAIN; + ret = io_req_defer_prep(req, io); + if (ret < 0) { + kfree(io); + return ret; + } + spin_lock_irq(&ctx->completion_lock); - if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - kfree(sqe_copy); return 0; } - memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); - req->submit.sqe = sqe_copy; - - INIT_WORK(&req->work, io_sq_wq_submit_work); + trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } -static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct sqe_submit *s, bool force_nonblock) +__attribute__((nonnull)) +static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { int ret, opcode; + struct io_ring_ctx *ctx = req->ctx; - req->user_data = READ_ONCE(s->sqe->user_data); - - if (unlikely(s->index >= ctx->sq_entries)) - return -EINVAL; - - opcode = READ_ONCE(s->sqe->opcode); + opcode = READ_ONCE(req->sqe->opcode); switch (opcode) { case IORING_OP_NOP: - ret = io_nop(req, req->user_data); + ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; - ret = io_read(req, s, force_nonblock); + ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; - ret = io_write(req, s, force_nonblock); + ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_READ_FIXED: - ret = 
io_read(req, s, force_nonblock); + ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITE_FIXED: - ret = io_write(req, s, force_nonblock); + ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, s->sqe, force_nonblock); + ret = io_fsync(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, s->sqe); + ret = io_poll_add(req, req->sqe, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, s->sqe); + ret = io_poll_remove(req, req->sqe); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, s->sqe, force_nonblock); + ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, s->sqe, force_nonblock); + ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, s->sqe, force_nonblock); + ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, s->sqe); + ret = io_timeout(req, req->sqe); + break; + case IORING_OP_TIMEOUT_REMOVE: + ret = io_timeout_remove(req, req->sqe); + break; + case IORING_OP_ACCEPT: + ret = io_accept(req, req->sqe, nxt, force_nonblock); + break; + case IORING_OP_CONNECT: + ret = io_connect(req, req->sqe, nxt, force_nonblock); + break; + case IORING_OP_ASYNC_CANCEL: + ret = io_async_cancel(req, req->sqe, nxt); break; default: ret = -EINVAL; @@ -2027,187 +2992,76 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ - if (s->needs_lock) + if (req->in_async) mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (s->needs_lock) + if (req->in_async) mutex_unlock(&ctx->uring_lock); } return 0; } -static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx, - const struct io_uring_sqe *sqe) +static void io_link_work_cb(struct io_wq_work **workptr) { - switch (sqe->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - return &ctx->pending_async[READ]; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - return &ctx->pending_async[WRITE]; - default: - return NULL; - } -} + struct io_wq_work *work = *workptr; + struct io_kiocb *link = work->data; -static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) -{ - u8 opcode = READ_ONCE(sqe->opcode); - - return !(opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED); + io_queue_linked_timeout(link); + work->func = io_wq_submit_work; } -static void io_sq_wq_submit_work(struct work_struct *work) +static void io_wq_submit_work(struct io_wq_work **workptr) { + struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_ring_ctx *ctx = req->ctx; - struct mm_struct *cur_mm = NULL; - struct async_list *async_list; - LIST_HEAD(req_list); - mm_segment_t old_fs; - int ret; + struct io_kiocb *nxt = NULL; + int ret = 0; - async_list = io_async_list_from_sqe(ctx, req->submit.sqe); -restart: - do { - struct sqe_submit *s = &req->submit; - const struct io_uring_sqe *sqe = s->sqe; - unsigned int flags = req->flags; + /* Ensure we clear previously set non-block flag */ + req->rw.ki_flags &= ~IOCB_NOWAIT; - /* Ensure we clear previously set non-block flag */ - req->rw.ki_flags &= ~IOCB_NOWAIT; + if (work->flags & IO_WQ_WORK_CANCEL) + ret = -ECANCELED; - ret = 0; - if (io_sqe_needs_user(sqe) && !cur_mm) { - if (!mmget_not_zero(ctx->sqo_mm)) { - ret = -EFAULT; - } else { 
- cur_mm = ctx->sqo_mm; - use_mm(cur_mm); - old_fs = get_fs(); - set_fs(USER_DS); - } - } + if (!ret) { + req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; + req->in_async = true; + do { + ret = io_issue_sqe(req, &nxt, false); + /* + * We can get EAGAIN for polled IO even though we're + * forcing a sync submission from here, since we can't + * wait for request slots on the block side. + */ + if (ret != -EAGAIN) + break; + cond_resched(); + } while (1); + } - if (!ret) { - s->has_user = cur_mm != NULL; - s->needs_lock = true; - do { - ret = __io_submit_sqe(ctx, req, s, false); - /* - * We can get EAGAIN for polled IO even though - * we're forcing a sync submission from here, - * since we can't wait for request slots on the - * block side. - */ - if (ret != -EAGAIN) - break; - cond_resched(); - } while (1); - } + /* drop submission reference */ + io_put_req(req); - /* drop submission reference */ + if (ret) { + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req, ret); io_put_req(req); - - if (ret) { - io_cqring_add_event(ctx, sqe->user_data, ret); - io_put_req(req); - } - - /* async context always use a copy of the sqe */ - kfree(sqe); - - /* req from defer and link list needn't decrease async cnt */ - if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE)) - goto out; - - if (!async_list) - break; - if (!list_empty(&req_list)) { - req = list_first_entry(&req_list, struct io_kiocb, - list); - list_del(&req->list); - continue; - } - if (list_empty(&async_list->list)) - break; - - req = NULL; - spin_lock(&async_list->lock); - if (list_empty(&async_list->list)) { - spin_unlock(&async_list->lock); - break; - } - list_splice_init(&async_list->list, &req_list); - spin_unlock(&async_list->lock); - - req = list_first_entry(&req_list, struct io_kiocb, list); - list_del(&req->list); - } while (req); - - /* - * Rare case of racing with a submitter. If we find the count has - * dropped to zero AND we have pending work items, then restart - * the processing. This is a tiny race window. - */ - if (async_list) { - ret = atomic_dec_return(&async_list->cnt); - while (!ret && !list_empty(&async_list->list)) { - spin_lock(&async_list->lock); - atomic_inc(&async_list->cnt); - list_splice_init(&async_list->list, &req_list); - spin_unlock(&async_list->lock); - - if (!list_empty(&req_list)) { - req = list_first_entry(&req_list, - struct io_kiocb, list); - list_del(&req->list); - goto restart; - } - ret = atomic_dec_return(&async_list->cnt); - } - } - -out: - if (cur_mm) { - set_fs(old_fs); - unuse_mm(cur_mm); - mmput(cur_mm); } -} - -/* - * See if we can piggy back onto previously submitted work, that is still - * running. We currently only allow this if the new request is sequential - * to the previous one we punted. 
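io_wq_submit_work() above retries -EAGAIN inline and, once done, hands any dependent link back through *workptr. The chains it services are built in userspace with IOSQE_IO_LINK. A hedged liburing sketch (write_then_fsync is a hypothetical helper; the caller must keep *iov stable until the write completes):

#include <liburing.h>
#include <errno.h>
#include <sys/uio.h>

/* hypothetical: queue "write, then fsync", where the fsync only runs
 * if the write succeeds (a failed link step cancels the rest) */
static int write_then_fsync(struct io_uring *ring, int fd, struct iovec *iov)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_writev(sqe, fd, iov, 1, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* the next SQE depends on this one */

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_fsync(sqe, fd, 0);

	return io_uring_submit(ring);
}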
- */ -static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) -{ - bool ret; - if (!list) - return false; - if (!(req->flags & REQ_F_SEQ_PREV)) - return false; - if (!atomic_read(&list->cnt)) - return false; + /* if a dependent link is ready, pass it back */ + if (!ret && nxt) { + struct io_kiocb *link; - ret = true; - spin_lock(&list->lock); - list_add_tail(&req->list, &list->list); - /* - * Ensure we see a simultaneous modification from io_sq_wq_submit_work() - */ - smp_mb(); - if (!atomic_read(&list->cnt)) { - list_del_init(&req->list); - ret = false; + io_prep_async_work(nxt, &link); + *workptr = &nxt->work; + if (link) { + nxt->work.flags |= IO_WQ_WORK_CB; + nxt->work.func = io_link_work_cb; + nxt->work.data = link; + } } - spin_unlock(&list->lock); - return ret; } static bool io_op_needs_file(const struct io_uring_sqe *sqe) @@ -2217,42 +3071,53 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe) switch (op) { case IORING_OP_NOP: case IORING_OP_POLL_REMOVE: + case IORING_OP_TIMEOUT: + case IORING_OP_TIMEOUT_REMOVE: + case IORING_OP_ASYNC_CANCEL: + case IORING_OP_LINK_TIMEOUT: return false; default: return true; } } -static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb *req) +static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, + int index) { + struct fixed_file_table *table; + + table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK]; +} + +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd; - flags = READ_ONCE(s->sqe->flags); - fd = READ_ONCE(s->sqe->fd); + flags = READ_ONCE(req->sqe->flags); + fd = READ_ONCE(req->sqe->fd); if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. 
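io_file_from_index() above resolves a fixed file in two steps: file_table[fd >> IORING_FILE_TABLE_SHIFT].files[fd & IORING_FILE_TABLE_MASK]. Userspace opts into that path by registering files and then passing a table index instead of a descriptor. A liburing sketch (paths and the read_slot1 helper are hypothetical; iov lifetime is the caller's problem):

#include <liburing.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/uio.h>

static int read_slot1(struct io_uring *ring, struct iovec *iov)
{
	int fds[2];
	struct io_uring_sqe *sqe;

	fds[0] = open("/tmp/a", O_RDONLY);	/* hypothetical files */
	fds[1] = open("/tmp/b", O_RDONLY);
	if (fds[0] < 0 || fds[1] < 0)
		return -errno;
	if (io_uring_register_files(ring, fds, 2))
		return -EINVAL;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	/* "fd" 1 is an index into the registered table, not a descriptor */
	io_uring_prep_readv(sqe, 1, iov, 1, 0);
	sqe->flags |= IOSQE_FIXED_FILE;
	return io_uring_submit(ring);
}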
- */ - req->sequence = s->sequence; - if (!io_op_needs_file(s->sqe)) + if (!io_op_needs_file(req->sqe)) return 0; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->user_files || + if (unlikely(!ctx->file_table || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; - req->file = ctx->user_files[fd]; + fd = array_index_nospec(fd, ctx->nr_user_files); + req->file = io_file_from_index(ctx, fd); + if (!req->file) + return -EBADF; req->flags |= REQ_F_FIXED_FILE; } else { - if (s->needs_fixed_file) + if (req->needs_fixed_file) return -EBADF; + trace_io_uring_file_get(ctx, fd); req->file = io_file_get(state, fd); if (unlikely(!req->file)) return -EBADF; @@ -2261,139 +3126,213 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, return 0; } -static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, bool force_nonblock) +static int io_grab_files(struct io_kiocb *req) { - int ret; + int ret = -EBADF; + struct io_ring_ctx *ctx = req->ctx; - ret = __io_submit_sqe(ctx, req, s, force_nonblock); - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - struct io_uring_sqe *sqe_copy; - - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (sqe_copy) { - struct async_list *list; - - s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); - list = io_async_list_from_sqe(ctx, s->sqe); - if (!io_add_to_prev_work(list, req)) { - if (list) - atomic_inc(&list->cnt); - INIT_WORK(&req->work, io_sq_wq_submit_work); - io_queue_async_work(ctx, req); - } + rcu_read_lock(); + spin_lock_irq(&ctx->inflight_lock); + /* + * We use the f_ops->flush() handler to ensure that we can flush + * out work accessing these files if the fd is closed. Check if + * the fd has changed since we started down this path, and disallow + * this operation if it has. + */ + if (fcheck(req->ring_fd) == req->ring_file) { + list_add(&req->inflight_entry, &ctx->inflight_list); + req->flags |= REQ_F_INFLIGHT; + req->work.files = current->files; + ret = 0; + } + spin_unlock_irq(&ctx->inflight_lock); + rcu_read_unlock(); - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - return 0; - } + return ret; +} + +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) +{ + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *prev = NULL; + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + + /* + * We don't expect the list to be empty, that will only happen if we + * race with the completion of the linked work. 
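io_link_timeout_fn() is the kernel half of IORING_OP_LINK_TIMEOUT: armed against the previous SQE in a link chain, it cancels that SQE if the timer fires first. From userspace, a hedged sketch (read_with_deadline is hypothetical; assumes the installed liburing provides io_uring_prep_link_timeout()):

#include <liburing.h>
#include <errno.h>
#include <sys/uio.h>

static int read_with_deadline(struct io_uring *ring, int fd, struct iovec *iov)
{
	struct __kernel_timespec ts = { .tv_nsec = 500 * 1000 * 1000 };
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_readv(sqe, fd, iov, 1, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* the timeout below guards this read */

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;
	io_uring_prep_link_timeout(sqe, &ts, 0);

	/* if the deadline fires first the read completes with -ECANCELED and
	 * the timeout with -ETIME; otherwise the timeout gets -ECANCELED */
	return io_uring_submit(ring);
}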
+ */ + if (!list_empty(&req->link_list)) { + prev = list_entry(req->link_list.prev, struct io_kiocb, + link_list); + if (refcount_inc_not_zero(&prev->refs)) { + list_del_init(&req->link_list); + prev->flags &= ~REQ_F_LINK_TIMEOUT; + } else + prev = NULL; } - /* drop submission reference */ - io_put_req(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); - /* and drop final reference, if we failed */ - if (ret) { - io_cqring_add_event(ctx, req->user_data, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (prev) { + if (prev->flags & REQ_F_LINK) + prev->flags |= REQ_F_FAIL_LINK; + io_async_find_and_cancel(ctx, req, prev->user_data, NULL, + -ETIME); + io_put_req(prev); + } else { + io_cqring_add_event(req, -ETIME); io_put_req(req); } - - return ret; + return HRTIMER_NORESTART; } -static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, bool force_nonblock) +static void io_queue_linked_timeout(struct io_kiocb *req) { - int ret; + struct io_ring_ctx *ctx = req->ctx; - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret != -EIOCBQUEUED) { - io_free_req(req); - io_cqring_add_event(ctx, s->sqe->user_data, ret); - } - return 0; + /* + * If the list is now empty, then our linked request finished before + * we got a chance to setup the timer + */ + spin_lock_irq(&ctx->completion_lock); + if (!list_empty(&req->link_list)) { + struct io_timeout_data *data = &req->io->timeout; + + data->timer.function = io_link_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), + data->mode); } + spin_unlock_irq(&ctx->completion_lock); + + /* drop submission reference */ + io_put_req(req); +} + +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *nxt; + + if (!(req->flags & REQ_F_LINK)) + return NULL; + + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, + link_list); + if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) + return NULL; - return __io_queue_sqe(ctx, req, s, force_nonblock); + req->flags |= REQ_F_LINK_TIMEOUT; + return nxt; } -static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, struct io_kiocb *shadow, - bool force_nonblock) +static void __io_queue_sqe(struct io_kiocb *req) { + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *nxt = NULL; int ret; - int need_submit = false; - if (!shadow) - return io_queue_sqe(ctx, req, s, force_nonblock); + ret = io_issue_sqe(req, &nxt, true); + if (nxt) + io_queue_async_work(nxt); /* - * Mark the first IO in link list as DRAIN, let all the following - * IOs enter the defer list. all IO needs to be completed before link - * list. + * We async punt it if the file wasn't marked NOWAIT, or if the file + * doesn't support non-blocking read/write attempts */ - req->flags |= REQ_F_IO_DRAIN; - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret != -EIOCBQUEUED) { - io_free_req(req); - io_cqring_add_event(ctx, s->sqe->user_data, ret); - return 0; + if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || + (req->flags & REQ_F_MUST_PUNT))) { + if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { + ret = io_grab_files(req); + if (ret) + goto err; } - } else { + /* - * If ret == 0 means that all IOs in front of link io are - * running done. let's queue link head. + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. 
*/ - need_submit = true; + io_queue_async_work(req); + return; } - /* Insert shadow req to defer_list, blocking next IOs */ - spin_lock_irq(&ctx->completion_lock); - list_add_tail(&shadow->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); +err: + /* drop submission reference */ + io_put_req(req); - if (need_submit) - return __io_queue_sqe(ctx, req, s, force_nonblock); + if (linked_timeout) { + if (!ret) + io_queue_linked_timeout(linked_timeout); + else + io_put_req(linked_timeout); + } - return 0; + /* and drop final reference, if we failed */ + if (ret) { + io_cqring_add_event(req, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; + io_put_req(req); + } } +static void io_queue_sqe(struct io_kiocb *req) +{ + int ret; + + if (unlikely(req->ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = false; + } + req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); + + ret = io_req_defer(req); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_cqring_add_event(req, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; + io_double_put_req(req); + } + } else + __io_queue_sqe(req); +} + +static inline void io_queue_link_head(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + io_cqring_add_event(req, -ECANCELED); + io_double_put_req(req); + } else + io_queue_sqe(req); +} + + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) -static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb **link, - bool force_nonblock) +static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, + struct io_kiocb **link) { - struct io_uring_sqe *sqe_copy; - struct io_kiocb *req; + struct io_ring_ctx *ctx = req->ctx; int ret; + req->user_data = req->sqe->user_data; + /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; - goto err; + goto err_req; } - req = io_get_req(ctx, state); - if (unlikely(!req)) { - ret = -EAGAIN; - goto err; - } - - ret = io_req_set_file(ctx, s, state, req); + ret = io_req_set_file(state, req); if (unlikely(ret)) { err_req: - io_free_req(req); -err: - io_cqring_add_event(ctx, s->sqe->user_data, ret); - return; + io_cqring_add_event(req, ret); + io_double_put_req(req); + return false; } /* @@ -2405,25 +3344,35 @@ err: */ if (*link) { struct io_kiocb *prev = *link; + struct io_async_ctx *io; + + if (req->sqe->flags & IOSQE_IO_DRAIN) + (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) { + io = kmalloc(sizeof(*io), GFP_KERNEL); + if (!io) { ret = -EAGAIN; goto err_req; } - s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); - list_add_tail(&req->list, &prev->link_list); - } else if (s->sqe->flags & IOSQE_IO_LINK) { + ret = io_req_defer_prep(req, io); + if (ret) { + kfree(io); + prev->flags |= REQ_F_FAIL_LINK; + goto err_req; + } + trace_io_uring_link(ctx, req, prev); + list_add_tail(&req->link_list, &prev->link_list); + } else if (req->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; - memcpy(&req->submit, s, sizeof(*s)); INIT_LIST_HEAD(&req->link_list); *link = req; } else { - io_queue_sqe(ctx, req, s, force_nonblock); + io_queue_sqe(req); } + + return true; } /* @@ -2442,7 +3391,7 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. 
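io_submit_state_start()/io_submit_state_end() bracket a submission batch with a block plug and a small io_kiocb cache, so the cheapest way to drive the interface is to queue many SQEs and submit once. A sketch (submit_batch is hypothetical):

#include <liburing.h>
#include <sys/uio.h>

/* one io_uring_enter() syscall covers all n reads */
static int submit_batch(struct io_uring *ring, int *fds, struct iovec *iovs, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		if (!sqe)
			break;	/* SQ ring full; a real caller would submit and retry */
		io_uring_prep_readv(sqe, fds[i], &iovs[i], 1, 0);
		sqe->user_data = i;
	}
	return io_uring_submit(ring);
}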
*/ static void io_submit_state_start(struct io_submit_state *state, - struct io_ring_ctx *ctx, unsigned max_ios) + unsigned int max_ios) { blk_start_plug(&state->plug); state->free_reqs = 0; @@ -2472,7 +3421,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; @@ -2488,108 +3437,143 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&rings->sq.tail)) + if (unlikely(head == smp_load_acquire(&rings->sq.tail))) return false; head = READ_ONCE(sq_array[head & ctx->sq_mask]); - if (head < ctx->sq_entries) { - s->index = head; - s->sqe = &ctx->sq_sqes[head]; - s->sequence = ctx->cached_sq_head; + if (likely(head < ctx->sq_entries)) { + /* + * All io needs to record the previous position, so if LINK vs + * DRAIN is used, it can mark the position of the first IO in + * the link list. + */ + req->sequence = ctx->cached_sq_head; + req->sqe = &ctx->sq_sqes[head]; ctx->cached_sq_head++; return true; } /* drop invalid entries */ ctx->cached_sq_head++; - rings->sq_dropped++; + ctx->cached_sq_dropped++; + WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); return false; } -static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, - unsigned int nr, bool has_user, bool mm_fault) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, + struct file *ring_file, int ring_fd, + struct mm_struct **mm, bool async) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; - struct io_kiocb *shadow_req = NULL; - bool prev_was_link = false; int i, submitted = 0; + bool mm_fault = false; + + /* if we have a backlog and couldn't flush it all, return BUSY */ + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) + return -EBUSY; if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, ctx, nr); + io_submit_state_start(&state, nr); statep = &state; } for (i = 0; i < nr; i++) { + struct io_kiocb *req; + unsigned int sqe_flags; + + req = io_get_req(ctx, statep); + if (unlikely(!req)) { + if (!submitted) + submitted = -EAGAIN; + break; + } + if (!io_get_sqring(ctx, req)) { + __io_free_req(req); + break; + } + + if (io_sqe_needs_user(req->sqe) && !*mm) { + mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); + if (!mm_fault) { + use_mm(ctx->sqo_mm); + *mm = ctx->sqo_mm; + } + } + + submitted++; + sqe_flags = req->sqe->flags; + + req->ring_file = ring_file; + req->ring_fd = ring_fd; + req->has_user = *mm != NULL; + req->in_async = async; + req->needs_fixed_file = async; + trace_io_uring_submit_sqe(ctx, req->sqe->user_data, + true, async); + if (!io_submit_sqe(req, statep, &link)) + break; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link.
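io_submit_sqe(), called in the loop above, also honours IOSQE_IO_DRAIN, which keeps an SQE from starting until everything submitted before it has completed. A drained fsync makes a cheap write barrier; hedged liburing sketch:

#include <liburing.h>
#include <errno.h>

/* barrier: this fsync is held back until all earlier SQEs complete */
static int queue_fsync_barrier(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	io_uring_prep_fsync(sqe, fd, 0);
	sqe->flags |= IOSQE_IO_DRAIN;
	return io_uring_submit(ring);
}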
*/ - if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req, - true); + if (!(sqe_flags & IOSQE_IO_LINK) && link) { + io_queue_link_head(link); link = NULL; - shadow_req = NULL; - } - prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; - - if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { - if (!shadow_req) { - shadow_req = io_get_req(ctx, NULL); - if (unlikely(!shadow_req)) - goto out; - shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); - refcount_dec(&shadow_req->refs); - } - shadow_req->sequence = sqes[i].sequence; - } - -out: - if (unlikely(mm_fault)) { - io_cqring_add_event(ctx, sqes[i].sqe->user_data, - -EFAULT); - } else { - sqes[i].has_user = has_user; - sqes[i].needs_lock = true; - sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link, true); - submitted++; } } if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req, true); + io_queue_link_head(link); if (statep) io_submit_state_end(&state); + /* Commit SQ ring head once we've consumed and submitted all SQEs */ + io_commit_sqring(ctx); + return submitted; } static int io_sq_thread(void *data) { - struct sqe_submit sqes[IO_IOPOLL_BATCH]; struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; + const struct cred *old_cred; mm_segment_t old_fs; DEFINE_WAIT(wait); unsigned inflight; unsigned long timeout; + int ret; - complete(&ctx->sqo_thread_started); + complete(&ctx->completions[1]); old_fs = get_fs(); set_fs(USER_DS); + old_cred = override_creds(ctx->creds); - timeout = inflight = 0; + ret = timeout = inflight = 0; while (!kthread_should_park()) { - bool all_fixed, mm_fault = false; - int i; + unsigned int to_submit; if (inflight) { unsigned nr_events = 0; if (ctx->flags & IORING_SETUP_IOPOLL) { - io_iopoll_check(ctx, &nr_events, 0); + /* + * inflight is the count of the maximum possible + * entries we submitted, but it can be smaller + * if we dropped some of them. If we don't have + * poll entries available, then we know that we + * have nothing left to poll for. Reset the + * inflight count to zero in that case. + */ + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->poll_list)) + __io_iopoll_check(ctx, &nr_events, 0); + else + inflight = 0; + mutex_unlock(&ctx->uring_lock); } else { /* * Normal IO, just pretend everything completed. @@ -2603,13 +3587,22 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + + /* + * If submit got -EBUSY, flag us as needing the application + * to enter the kernel to reap and flush events. + */ + if (!to_submit || ret == -EBUSY) { /* * We're polling. If we're within the defined idle * period, then let us spin without work before going - * to sleep. + * to sleep. The exception is if we got EBUSY doing + * more IO, we should wait for the application to + * reap events and wake us up. 
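When io_sq_thread() goes idle it first sets IORING_SQ_NEED_WAKEUP in the SQ ring flags, and the submitter is expected to kick it with io_uring_enter(..., IORING_ENTER_SQ_WAKEUP). liburing performs this check inside io_uring_submit(); the raw protocol looks roughly like this sketch (sq_flags points at the mmap'ed flags word; the full memory barrier the kernel expects between writing the SQ tail and reading the flags is assumed to have been issued by the caller):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* with IORING_SETUP_SQPOLL the kernel thread normally consumes SQEs with
 * no syscall at all; we only enter the kernel once it has gone to sleep */
static void sqpoll_kick(int ring_fd, const volatile unsigned *sq_flags)
{
	if (*sq_flags & IORING_SQ_NEED_WAKEUP)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_SQ_WAKEUP, NULL, 0);
}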
*/ - if (inflight || !time_after(jiffies, timeout)) { + if (inflight || + (!time_after(jiffies, timeout) && ret != -EBUSY)) { cond_resched(); continue; } @@ -2634,7 +3627,8 @@ static int io_sq_thread(void *data) /* make sure to read SQ tail after writing flags */ smp_mb(); - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit || ret == -EBUSY) { if (kthread_should_park()) { finish_wait(&ctx->sqo_wait, &wait); break; @@ -2652,31 +3646,10 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - i = 0; - all_fixed = true; - do { - if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) - all_fixed = false; - - i++; - if (i == ARRAY_SIZE(sqes)) - break; - } while (io_get_sqring(ctx, &sqes[i])); - - /* Unless all new commands are FIXED regions, grab mm */ - if (!all_fixed && !cur_mm) { - mm_fault = !mmget_not_zero(ctx->sqo_mm); - if (!mm_fault) { - use_mm(ctx->sqo_mm); - cur_mm = ctx->sqo_mm; - } - } - - inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, - mm_fault); - - /* Commit SQ ring head once we've consumed all SQEs */ - io_commit_sqring(ctx); + to_submit = min(to_submit, ctx->sq_entries); + ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); + if (ret > 0) + inflight += ret; } set_fs(old_fs); @@ -2684,85 +3657,13 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + revert_creds(old_cred); kthread_parkme(); return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, - bool block_for_last) -{ - struct io_submit_state state, *statep = NULL; - struct io_kiocb *link = NULL; - struct io_kiocb *shadow_req = NULL; - bool prev_was_link = false; - int i, submit = 0; - - if (to_submit > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, ctx, to_submit); - statep = &state; - } - - for (i = 0; i < to_submit; i++) { - bool force_nonblock = true; - struct sqe_submit s; - - if (!io_get_sqring(ctx, &s)) - break; - - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req, - force_nonblock); - link = NULL; - shadow_req = NULL; - } - prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; - - if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { - if (!shadow_req) { - shadow_req = io_get_req(ctx, NULL); - if (unlikely(!shadow_req)) - goto out; - shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); - refcount_dec(&shadow_req->refs); - } - shadow_req->sequence = s.sequence; - } - -out: - s.has_user = true; - s.needs_lock = false; - s.needs_fixed_file = false; - submit++; - - /* - * The caller will block for events after submit, submit the - * last IO non-blocking. This is either the only IO it's - * submitting, or it already submitted the previous ones. This - * improves performance by avoiding an async punt that we don't - * need to do. 
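The other side of that handshake is waiting for completions: io_uring_enter() with IORING_ENTER_GETEVENTS and a min_complete count, which io_cqring_wait() below services. With liburing the submit and the wait collapse into one call (reap_at_least is a hypothetical helper):

#include <liburing.h>

/* flush the queued SQEs and sleep until at least 'want' CQEs are ready */
static int reap_at_least(struct io_uring *ring, unsigned want)
{
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_submit_and_wait(ring, want);
	if (ret < 0)
		return ret;
	while (io_uring_peek_cqe(ring, &cqe) == 0) {
		/* inspect cqe->user_data and cqe->res here */
		io_uring_cqe_seen(ring, cqe);
	}
	return 0;
}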
- */ - if (block_for_last && submit == to_submit) - force_nonblock = false; - - io_submit_sqe(ctx, &s, statep, &link, force_nonblock); - } - io_commit_sqring(ctx); - - if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req, - !block_for_last); - if (statep) - io_submit_state_end(statep); - - return submit; -} - struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; @@ -2770,7 +3671,7 @@ struct io_wait_queue { unsigned nr_timeouts; }; -static inline bool io_should_wake(struct io_wait_queue *iowq) +static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) { struct io_ring_ctx *ctx = iowq->ctx; @@ -2779,7 +3680,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ - return io_cqring_events(ctx->rings) >= iowq->to_wait || + return io_cqring_events(ctx, noflush) >= iowq->to_wait || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } @@ -2789,7 +3690,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); - if (!io_should_wake(iowq)) + /* use noflush == true, as we can't safely rely on locking context */ + if (!io_should_wake(iowq, true)) return -1; return autoremove_wake_function(curr, mode, wake_flags, key); @@ -2812,9 +3714,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .to_wait = min_events, }; struct io_rings *rings = ctx->rings; - int ret; + int ret = 0; - if (io_cqring_events(rings) >= min_events) + if (io_cqring_events(ctx, false) >= min_events) return 0; if (sig) { @@ -2830,24 +3732,22 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = 0; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + trace_io_uring_cqring_wait(ctx, min_events); do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); - if (io_should_wake(&iowq)) + if (io_should_wake(&iowq, false)) break; schedule(); if (signal_pending(current)) { - ret = -ERESTARTSYS; + ret = -EINTR; break; } } while (1); finish_wait(&ctx->wait, &iowq.wq); - restore_saved_sigmask_unless(ret == -ERESTARTSYS); - if (ret == -ERESTARTSYS) - ret = -EINTR; + restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } @@ -2865,19 +3765,29 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #else int i; - for (i = 0; i < ctx->nr_user_files; i++) - fput(ctx->user_files[i]); + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file; + + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } #endif } static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - if (!ctx->user_files) + unsigned nr_tables, i; + + if (!ctx->file_table) return -ENXIO; __io_sqe_files_unregister(ctx); - kfree(ctx->user_files); - ctx->user_files = NULL; + nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); + for (i = 0; i < nr_tables; i++) + kfree(ctx->file_table[i].files); + kfree(ctx->file_table); + ctx->file_table = NULL; ctx->nr_user_files = 0; return 0; } @@ -2885,7 +3795,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_stop(struct io_ring_ctx *ctx) { if (ctx->sqo_thread) { - wait_for_completion(&ctx->sqo_thread_started); + wait_for_completion(&ctx->completions[1]); /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity @@ -2899,15 +3809,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) static void io_finish_async(struct io_ring_ctx *ctx) { - int i; - io_sq_thread_stop(ctx); - for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) { - if (ctx->sqo_wq[i]) { - destroy_workqueue(ctx->sqo_wq[i]); - ctx->sqo_wq[i] = NULL; - } + if (ctx->io_wq) { + io_wq_destroy(ctx->io_wq); + ctx->io_wq = NULL; } } @@ -2915,11 +3821,9 @@ static void io_finish_async(struct io_ring_ctx *ctx) static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; - int i; - for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) - if (ctx->sqo_wq[i]) - flush_workqueue(ctx->sqo_wq[i]); + if (ctx->io_wq) + io_wq_flush(ctx->io_wq); unix_destruct_scm(skb); } @@ -2934,7 +3838,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) struct sock *sk = ctx->ring_sock->sk; struct scm_fp_list *fpl; struct sk_buff *skb; - int i; + int i, nr_files; if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { unsigned long inflight = ctx->user->unix_inflight + nr; @@ -2954,21 +3858,33 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) } skb->sk = sk; - skb->destructor = io_destruct_skb; + nr_files = 0; fpl->user = get_uid(ctx->user); for (i = 0; i < nr; i++) { - fpl->fp[i] = get_file(ctx->user_files[i + offset]); - unix_inflight(fpl->user, fpl->fp[i]); + struct file *file = io_file_from_index(ctx, i + offset); + + if (!file) + continue; + fpl->fp[nr_files] = get_file(file); + unix_inflight(fpl->user, fpl->fp[nr_files]); + nr_files++; } - fpl->max = fpl->count = nr; - UNIXCB(skb).fp = fpl; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_queue_head(&sk->sk_receive_queue, skb); + if (nr_files) { + fpl->max = SCM_MAX_FD; + fpl->count = nr_files; + UNIXCB(skb).fp = fpl; + skb->destructor = io_destruct_skb; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + skb_queue_head(&sk->sk_receive_queue, skb); - for (i = 0; i < nr; i++) - fput(fpl->fp[i]); + for (i = 0; i < nr_files; i++) + fput(fpl->fp[i]); + } else { + kfree_skb(skb); + kfree(fpl); + } return 0; } @@ -2999,7 +3915,10 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) return 0; while (total < ctx->nr_user_files) { - fput(ctx->user_files[total]); + struct file *file = io_file_from_index(ctx, total); + + if (file) + fput(file); total++; } @@ -3012,33 +3931,79 @@ 
static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif +static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, + unsigned nr_files) +{ + int i; + + for (i = 0; i < nr_tables; i++) { + struct fixed_file_table *table = &ctx->file_table[i]; + unsigned this_files; + + this_files = min(nr_files, IORING_MAX_FILES_TABLE); + table->files = kcalloc(this_files, sizeof(struct file *), + GFP_KERNEL); + if (!table->files) + break; + nr_files -= this_files; + } + + if (i == nr_tables) + return 0; + + for (i = 0; i < nr_tables; i++) { + struct fixed_file_table *table = &ctx->file_table[i]; + kfree(table->files); + } + return 1; +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; + unsigned nr_tables; int fd, ret = 0; unsigned i; - if (ctx->user_files) + if (ctx->file_table) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; - ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL); - if (!ctx->user_files) + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); + ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + GFP_KERNEL); + if (!ctx->file_table) return -ENOMEM; - for (i = 0; i < nr_args; i++) { + if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { + kfree(ctx->file_table); + ctx->file_table = NULL; + return -ENOMEM; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + struct fixed_file_table *table; + unsigned index; + ret = -EFAULT; if (copy_from_user(&fd, &fds[i], sizeof(fd))) break; + /* allow sparse sets */ + if (fd == -1) { + ret = 0; + continue; + } - ctx->user_files[i] = fget(fd); + table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; + table->files[index] = fget(fd); ret = -EBADF; - if (!ctx->user_files[i]) + if (!table->files[index]) break; /* * Don't allow io_uring instances to be registered. If UNIX @@ -3047,20 +4012,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (ctx->user_files[i]->f_op == &io_uring_fops) { - fput(ctx->user_files[i]); + if (table->files[index]->f_op == &io_uring_fops) { + fput(table->files[index]); break; } - ctx->nr_user_files++; ret = 0; } if (ret) { - for (i = 0; i < ctx->nr_user_files; i++) - fput(ctx->user_files[i]); + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file; + + file = io_file_from_index(ctx, i); + if (file) + fput(file); + } + for (i = 0; i < nr_tables; i++) + kfree(ctx->file_table[i].files); - kfree(ctx->user_files); - ctx->user_files = NULL; + kfree(ctx->file_table); + ctx->file_table = NULL; ctx->nr_user_files = 0; return ret; } @@ -3072,9 +4043,202 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } +static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) +{ +#if defined(CONFIG_UNIX) + struct file *file = io_file_from_index(ctx, index); + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. 
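The skb bookkeeping above is what makes slot replacement safe, and the registration loop now tolerates fd == -1, so a set can be registered sparse and filled in later via IORING_REGISTER_FILES_UPDATE. liburing sketch (path hypothetical; assumes io_uring_register_files_update() in the installed liburing):

#include <liburing.h>
#include <fcntl.h>

static int reserve_then_fill(struct io_uring *ring)
{
	int fds[4] = { -1, -1, -1, -1 };	/* sparse set: four empty slots */
	int newfd;

	if (io_uring_register_files(ring, fds, 4))
		return -1;

	newfd = open("/tmp/data", O_RDONLY);	/* hypothetical file */
	if (newfd < 0)
		return -1;
	/* install newfd into slot 2 without re-registering the whole set */
	return io_uring_register_files_update(ring, 2, &newfd, 1);
}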
+ */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(io_file_from_index(ctx, index)); +#endif +} + +static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, + int index) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head *head = &sock->sk_receive_queue; + struct sk_buff *skb; + + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. + */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb) { + struct scm_fp_list *fpl = UNIXCB(skb).fp; + + if (fpl->count < SCM_MAX_FD) { + __skb_unlink(skb, head); + spin_unlock_irq(&head->lock); + fpl->fp[fpl->count] = get_file(file); + unix_inflight(fpl->user, fpl->fp[fpl->count]); + fpl->count++; + spin_lock_irq(&head->lock); + __skb_queue_head(head, skb); + } else { + skb = NULL; + } + } + spin_unlock_irq(&head->lock); + + if (skb) { + fput(file); + return 0; + } + + return __io_sqe_files_scm(ctx, 1, index); +#else + return 0; +#endif +} + +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + __s32 __user *fds; + int fd, i, err; + __u32 done; + + if (!ctx->file_table) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (check_add_overflow(up.offset, nr_args, &done)) + return -EOVERFLOW; + if (done > ctx->nr_user_files) + return -EINVAL; + + done = 0; + fds = (__s32 __user *) up.fds; + while (nr_args) { + struct fixed_file_table *table; + unsigned index; + + err = 0; + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + err = -EFAULT; + break; + } + i = array_index_nospec(up.offset, ctx->nr_user_files); + table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + index = i & IORING_FILE_TABLE_MASK; + if (table->files[index]) { + io_sqe_file_unregister(ctx, i); + table->files[index] = NULL; + } + if (fd != -1) { + struct file *file; + + file = fget(fd); + if (!file) { + err = -EBADF; + break; + } + /* + * Don't allow io_uring instances to be registered. If + * UNIX isn't enabled, then this causes a reference + * cycle and this instance can never get freed. If UNIX + * is enabled we'll handle it just fine, but there's + * still no point in allowing a ring fd as it doesn't + * support regular read/write anyway. + */ + if (file->f_op == &io_uring_fops) { + fput(file); + err = -EBADF; + break; + } + table->files[index] = file; + err = io_sqe_file_register(ctx, file, i); + if (err) + break; + } + nr_args--; + done++; + up.offset++; + } + + return done ? 
done : err; +} + +static void io_put_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + io_put_req(req); +} + +static void io_get_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + refcount_inc(&req->refs); +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_wq_data data; + unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -3118,26 +4282,18 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - /* Do QD, or 2 * CPUS, whatever is smallest */ - ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq", - WQ_UNBOUND | WQ_FREEZABLE, - min(ctx->sq_entries - 1, 2 * num_online_cpus())); - if (!ctx->sqo_wq[0]) { - ret = -ENOMEM; - goto err; - } - - /* - * This is for buffered writes, where we want to limit the parallelism - * due to file locking in file systems. As "normal" buffered writes - * should parellelize on writeout quite nicely, limit us to having 2 - * pending. This avoids massive contention on the inode when doing - * buffered async writes. - */ - ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq", - WQ_UNBOUND | WQ_FREEZABLE, 2); - if (!ctx->sqo_wq[1]) { - ret = -ENOMEM; + data.mm = ctx->sqo_mm; + data.user = ctx->user; + data.creds = ctx->creds; + data.get_work = io_get_work; + data.put_work = io_put_work; + + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; goto err; } @@ -3483,6 +4639,10 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_unaccount_mem(ctx->user, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); + put_cred(ctx->creds); + kfree(ctx->completions); + kfree(ctx->cancel_hash); + kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); } @@ -3521,8 +4681,15 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_kill_timeouts(ctx); io_poll_remove_all(ctx); + + if (ctx->io_wq) + io_wq_cancel_all(ctx->io_wq); + io_iopoll_reap_events(ctx); - wait_for_completion(&ctx->ctx_done); + /* if we failed setting up the ctx, we might not have any rings */ + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -3535,12 +4702,58 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +static void io_uring_cancel_files(struct io_ring_ctx *ctx, + struct files_struct *files) +{ + struct io_kiocb *req; + DEFINE_WAIT(wait); + + while (!list_empty_careful(&ctx->inflight_list)) { + struct io_kiocb *cancel_req = NULL; + + spin_lock_irq(&ctx->inflight_lock); + list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { + if (req->work.files != files) + continue; + /* req is being completed, ignore */ + if (!refcount_inc_not_zero(&req->refs)) + continue; + cancel_req = req; + break; + } + if (cancel_req) + prepare_to_wait(&ctx->inflight_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ctx->inflight_lock); + + /* We need to keep going until we don't find a matching req */ + if (!cancel_req) + break; + + io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + io_put_req(cancel_req); + schedule(); + } + finish_wait(&ctx->inflight_wait, &wait); +} + +static int io_uring_flush(struct file *file, void *data) { - 
loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; - unsigned long sz = vma->vm_end - vma->vm_start; struct io_ring_ctx *ctx = file->private_data; - unsigned long pfn; + + io_uring_cancel_files(ctx, data); + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { + io_cqring_overflow_flush(ctx, true); + io_wq_cancel_all(ctx->io_wq); + } + return 0; +} + +static void *io_uring_validate_mmap_request(struct file *file, + loff_t pgoff, size_t sz) +{ + struct io_ring_ctx *ctx = file->private_data; + loff_t offset = pgoff << PAGE_SHIFT; struct page *page; void *ptr; @@ -3553,17 +4766,59 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) ptr = ctx->sq_sqes; break; default: - return -EINVAL; + return ERR_PTR(-EINVAL); } page = virt_to_head_page(ptr); if (sz > page_size(page)) - return -EINVAL; + return ERR_PTR(-EINVAL); + + return ptr; +} + +#ifdef CONFIG_MMU + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t sz = vma->vm_end - vma->vm_start; + unsigned long pfn; + void *ptr; + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } +#else /* !CONFIG_MMU */ + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; +} + +static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; +} + +static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + void *ptr; + + ptr = io_uring_validate_mmap_request(file, pgoff, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + return (unsigned long) ptr; +} + +#endif /* !CONFIG_MMU */ + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const sigset_t __user *, sig, size_t, sigsz) @@ -3596,25 +4851,20 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { + if (!list_empty_careful(&ctx->cq_overflow_list)) + io_cqring_overflow_flush(ctx, false); if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sqo_wait); submitted = to_submit; } else if (to_submit) { - bool block_for_last = false; + struct mm_struct *cur_mm; to_submit = min(to_submit, ctx->sq_entries); - - /* - * Allow last submission to block in a series, IFF the caller - * asked to wait for events and we don't currently have - * enough. This potentially avoids an async punt. 
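io_uring_validate_mmap_request() above dispatches purely on the mmap offset: IORING_OFF_SQ_RING, IORING_OFF_CQ_RING and IORING_OFF_SQES each select a region. liburing hides this, but the raw setup is roughly the following sketch (CQ-ring mapping elided; with IORING_FEAT_SINGLE_MMAP it shares the SQ mapping anyway):

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_and_map(unsigned entries)
{
	struct io_uring_params p;
	void *sq_ring, *sqes;
	int fd;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, entries, &p);
	if (fd < 0)
		return -1;

	sq_ring = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		       fd, IORING_OFF_SQ_RING);
	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		    fd, IORING_OFF_SQES);
	if (sq_ring == MAP_FAILED || sqes == MAP_FAILED)
		return -1;
	return fd;
}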
- */ - if (to_submit == min_complete && - io_cqring_events(ctx->rings) < min_complete) - block_for_last = true; - mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit, block_for_last); + /* already have mm, so io_submit_sqes() won't try to grab it */ + cur_mm = ctx->sqo_mm; + submitted = io_submit_sqes(ctx, to_submit, f.file, fd, + &cur_mm, false); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { @@ -3637,7 +4887,12 @@ out_fput: static const struct file_operations io_uring_fops = { .release = io_uring_release, + .flush = io_uring_flush, .mmap = io_uring_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = io_uring_nommu_get_unmapped_area, + .mmap_capabilities = io_uring_nommu_mmap_capabilities, +#endif .poll = io_uring_poll, .fasync = io_uring_fasync, }; @@ -3668,12 +4923,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) + if (size == SIZE_MAX) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -EOVERFLOW; + } ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) + if (!ctx->sq_sqes) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -ENOMEM; + } return 0; } @@ -3736,10 +4997,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, * since the sqes are only used at submission time. This allows for - * some flexibility in overcommitting a bit. + * some flexibility in overcommitting a bit. If the application has + * set IORING_SETUP_CQSIZE, it will have passed in the desired number + * of CQ ring entries manually. */ p->sq_entries = roundup_pow_of_two(entries); - p->cq_entries = 2 * p->sq_entries; + if (p->flags & IORING_SETUP_CQSIZE) { + /* + * If IORING_SETUP_CQSIZE is set, we do the same roundup + * to a power-of-two, if it isn't already. We do NOT impose + * any cq vs sq ring sizing. 
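IORING_SETUP_CQSIZE, added here, lets the application size the CQ ring independently of the SQ ring, which pairs naturally with the new overflow/NODROP behaviour. liburing sketch:

#include <liburing.h>
#include <string.h>

static int init_big_cq(struct io_uring *ring)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_CQSIZE;
	p.cq_entries = 4096;	/* rounded up to a power of two by the kernel */
	return io_uring_queue_init_params(128, ring, &p);
}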
+ */ + if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + return -EINVAL; + p->cq_entries = roundup_pow_of_two(p->cq_entries); + } else { + p->cq_entries = 2 * p->sq_entries; + } user = get_uid(current_user()); account_mem = !capable(CAP_IPC_LOCK); @@ -3764,6 +5038,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx->compat = in_compat_syscall(); ctx->account_mem = account_mem; ctx->user = user; + ctx->creds = get_current_cred(); ret = io_allocate_scq_urings(ctx, p); if (ret) @@ -3773,10 +5048,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret) goto err; - ret = io_uring_get_fd(ctx); - if (ret < 0) - goto err; - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -3794,7 +5065,17 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->features = IORING_FEAT_SINGLE_MMAP; + /* + * Install ring fd as the very last thing, so we don't risk someone + * having closed it before we finish setup + */ + ret = io_uring_get_fd(ctx); + if (ret < 0) + goto err; + + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE; + trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); @@ -3820,7 +5101,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) return -EINVAL; ret = io_uring_create(entries, &p); @@ -3864,7 +5145,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * no new references will come in after we've killed the percpu ref. 
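__io_uring_register() quiesces the ring by killing the percpu ref and waiting on completions[0] before touching registered state. One opcode handled just below, IORING_REGISTER_EVENTFD, is how applications bridge CQE arrival into a poll/epoll loop; liburing sketch (attach_eventfd is hypothetical):

#include <liburing.h>
#include <sys/eventfd.h>

/* the eventfd is signalled whenever a CQE is posted to this ring */
static int attach_eventfd(struct io_uring *ring)
{
	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

	if (efd < 0)
		return -1;
	if (io_uring_register_eventfd(ring, efd))
		return -1;
	return efd;	/* hand this fd to epoll/poll elsewhere */
}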
*/ mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->ctx_done); + wait_for_completion(&ctx->completions[0]); mutex_lock(&ctx->uring_lock); switch (opcode) { @@ -3886,6 +5167,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_files_unregister(ctx); break; + case IORING_REGISTER_FILES_UPDATE: + ret = io_sqe_files_update(ctx, arg, nr_args); + break; case IORING_REGISTER_EVENTFD: ret = -EINVAL; if (nr_args != 1) @@ -3904,7 +5188,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, } /* bring the ctx back to life */ - reinit_completion(&ctx->ctx_done); + reinit_completion(&ctx->completions[0]); percpu_ref_reinit(&ctx->refs); return ret; } @@ -3929,6 +5213,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); + trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, + ctx->cq_ev_fd != NULL, ret); out_fput: fdput(f); return ret; diff --git a/fs/ioctl.c b/fs/ioctl.c index 55c7cfb0e599..2f5e4e5b97e1 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -8,6 +8,7 @@ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> +#include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> @@ -174,10 +175,9 @@ static int fiemap_check_ranges(struct super_block *sb, return 0; } -static int ioctl_fiemap(struct file *filp, unsigned long arg) +static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; - struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; @@ -244,7 +244,8 @@ fdput: return ret; } -static long ioctl_file_clone_range(struct file *file, void __user *argp) +static long ioctl_file_clone_range(struct file *file, + struct file_clone_range __user *argp) { struct file_clone_range args; @@ -491,6 +492,35 @@ int ioctl_preallocate(struct file *filp, int mode, void __user *argp) sr.l_len); } +/* on ia32 l_start is on a 32-bit boundary */ +#if defined CONFIG_COMPAT && defined(CONFIG_X86_64) +/* just account for different alignment */ +int compat_ioctl_preallocate(struct file *file, int mode, + struct space_resv_32 __user *argp) +{ + struct inode *inode = file_inode(file); + struct space_resv_32 sr; + + if (copy_from_user(&sr, argp, sizeof(sr))) + return -EFAULT; + + switch (sr.l_whence) { + case SEEK_SET: + break; + case SEEK_CUR: + sr.l_start += file->f_pos; + break; + case SEEK_END: + sr.l_start += i_size_read(inode); + break; + default: + return -EINVAL; + } + + return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); +} +#endif + static int file_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -590,9 +620,9 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static int ioctl_file_dedupe_range(struct file *file, void __user *arg) +static int ioctl_file_dedupe_range(struct file *file, + struct file_dedupe_range __user *argp) { - struct file_dedupe_range __user *argp = arg; struct file_dedupe_range *same = NULL; int ret; unsigned long size; @@ -641,7 +671,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { int error = 0; - int __user *argp = (int __user *)arg; + void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch 
(cmd) { @@ -680,13 +710,13 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, break; case FS_IOC_FIEMAP: - return ioctl_fiemap(filp, arg); + return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; - return put_user(inode->i_sb->s_blocksize, argp); + return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); @@ -725,3 +755,37 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { return ksys_ioctl(fd, cmd, arg); } + +#ifdef CONFIG_COMPAT +/** + * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation + * + * This is not normally called as a function, but instead set in struct + * file_operations as + * + * .compat_ioctl = compat_ptr_ioctl, + * + * On most architectures, the compat_ptr_ioctl() just passes all arguments + * to the corresponding ->ioctl handler. The exception is arch/s390, where + * compat_ptr() clears the top bit of a 32-bit pointer value, so user space + * pointers to the second 2GB alias the first 2GB, as is the case for + * native 32-bit s390 user space. + * + * The compat_ptr_ioctl() function must therefore be used only with ioctl + * functions that either ignore the argument or pass a pointer to a + * compatible data type. + * + * If any ioctl command handled by fops->unlocked_ioctl passes a plain + * integer instead of a pointer, or any of the passed data types + * is incompatible between 32-bit and 64-bit architectures, a proper + * handler is required instead of compat_ptr_ioctl. + */ +long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + if (!file->f_op->unlocked_ioctl) + return -ENOIOCTLCMD; + + return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +EXPORT_SYMBOL(compat_ptr_ioctl); +#endif diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c index 484dd8eda861..76925b40b5fd 100644 --- a/fs/iomap/apply.c +++ b/fs/iomap/apply.c @@ -7,6 +7,7 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> +#include "trace.h" /* * Execute an iomap write on a segment of the mapping that spans a @@ -28,6 +29,8 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, loff_t written = 0, ret; u64 end; + trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_); + /* * Need to map a range from start position for length bytes. This can * span multiple pages - it is only guaranteed to return a range of a @@ -48,6 +51,10 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, if (WARN_ON(iomap.length == 0)) return -EIO; + trace_iomap_apply_dstmap(inode, &iomap); + if (srcmap.type != IOMAP_HOLE) + trace_iomap_apply_srcmap(inode, &srcmap); + /* * Cut down the length to the one actually provided by the filesystem, * as it might not be able to give us the whole size that we requested.
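The kernel-doc above spells out the intended wiring. As a short illustrative sketch (the foo_* names are hypothetical and not part of this diff), a driver whose ioctl commands all take pointer arguments would hook the new helper up as:

	static long foo_ioctl(struct file *file, unsigned int cmd,
			      unsigned long arg);

	static const struct file_operations foo_fops = {
		.owner		= THIS_MODULE,
		.unlocked_ioctl	= foo_ioctl,
		/* safe: every foo ioctl takes a pointer argument */
		.compat_ioctl	= compat_ptr_ioctl,
	};

A command that passes a plain integer, or a struct whose layout differs between 32-bit and 64-bit userspace, still needs a hand-written .compat_ioctl handler.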
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c62e807956b6..d33c7bc5ee92 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1067,20 +1067,19 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) lock_page(page); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || - (page_offset(page) > size)) { + offset = page_offset(page); + if (page->mapping != inode->i_mapping || offset > size) { /* We overload EFAULT to mean page got truncated */ ret = -EFAULT; goto out_unlock; } /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_SHIFT) > size) + if (offset > size - PAGE_SIZE) length = offset_in_page(size); else length = PAGE_SIZE; - offset = page_offset(page); while (length > 0) { ret = iomap_apply(inode, offset, length, IOMAP_WRITE | IOMAP_FAULT, ops, page, diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 49bf9780e3ed..23837926c0c5 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -201,12 +201,12 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; unsigned int align = iov_iter_alignment(dio->submit.iter); - struct iov_iter iter; struct bio *bio; bool need_zeroout = false; bool use_fua = false; int nr_pages, ret = 0; size_t copied = 0; + size_t orig_count; if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -236,15 +236,18 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } /* - * Operate on a partial iter trimmed to the extent we were called for. - * We'll update the iter in the dio once we're done with this extent. + * Save the original count and trim the iter to just the extent we + * are operating on right now. The iter will be re-expanded once + * we are done. */ - iter = *dio->submit.iter; - iov_iter_truncate(&iter, length); + orig_count = iov_iter_count(dio->submit.iter); + iov_iter_truncate(dio->submit.iter, length); - nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); - if (nr_pages <= 0) - return nr_pages; + nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); + if (nr_pages <= 0) { + ret = nr_pages; + goto out; + } if (need_zeroout) { /* zero out from the start of the block to the write offset */ @@ -257,7 +260,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, size_t n; if (dio->error) { iov_iter_revert(dio->submit.iter, copied); - return 0; + copied = ret = 0; + goto out; } bio = bio_alloc(GFP_KERNEL, nr_pages); @@ -268,7 +272,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - ret = bio_iov_iter_get_pages(bio, &iter); + ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall @@ -294,13 +298,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio_set_pages_dirty(bio); } - iov_iter_advance(dio->submit.iter, n); - dio->size += n; pos += n; copied += n; - nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); iomap_dio_submit_bio(dio, iomap, bio); } while (nr_pages); @@ -318,7 +320,12 @@ zero_tail: if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - return copied ? 
copied : ret; +out: + /* Undo iter limitation to current extent */ + iov_iter_reexpand(dio->submit.iter, orig_count - copied); + if (copied) + return copied; + return ret; } static loff_t @@ -398,7 +405,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); size_t count = iov_iter_count(iter); - loff_t pos = iocb->ki_pos, start = pos; + loff_t pos = iocb->ki_pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; struct blk_plug plug; @@ -433,7 +440,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; - if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ) + if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { flags |= IOMAP_WRITE; @@ -454,14 +461,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, start, end)) { + if (filemap_range_has_page(mapping, pos, end)) { ret = -EAGAIN; goto out_free_dio; } flags |= IOMAP_NOWAIT; } - ret = filemap_write_and_wait_range(mapping, start, end); + ret = filemap_write_and_wait_range(mapping, pos, end); if (ret) goto out_free_dio; @@ -472,7 +479,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * pretty crazy thing to do, so we don't support it 100%. */ ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); + pos >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; @@ -500,8 +507,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } pos += ret; - if (iov_iter_rw(iter) == READ && pos >= dio->i_size) + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) { + /* + * We only report that we've read data up to i_size. + * Revert iter to a state corresponding to that as + * some callers (such as splice code) rely on it. 
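+			 *
+			 * Worked example (illustrative numbers): with
+			 * i_size at 4096, an 8192-byte O_DIRECT read
+			 * from offset 0 can leave pos == 8192 here;
+			 * reverting by 8192 - 4096 bytes puts the iter
+			 * back in the state matching the data actually
+			 * reported as read, since nothing past i_size
+			 * is counted.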
+ */ + iov_iter_revert(iter, pos - dio->i_size); break; + } } while ((count = iov_iter_count(iter)) > 0); blk_finish_plug(&plug); diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 690ef2d7c6c8..bccf305ea9ce 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -133,12 +133,16 @@ iomap_bmap(struct address_space *mapping, sector_t bno, struct inode *inode = mapping->host; loff_t pos = bno << inode->i_blkbits; unsigned blocksize = i_blocksize(inode); + int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; - iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); + ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno, + iomap_bmap_actor); + if (ret) + return 0; return bno; } EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4ca1aa2f3f6e..6dc227b8c47e 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -80,6 +80,109 @@ DEFINE_PAGE_EVENT(iomap_writepage); DEFINE_PAGE_EVENT(iomap_releasepage); DEFINE_PAGE_EVENT(iomap_invalidatepage); +#define IOMAP_TYPE_STRINGS \ + { IOMAP_HOLE, "HOLE" }, \ + { IOMAP_DELALLOC, "DELALLOC" }, \ + { IOMAP_MAPPED, "MAPPED" }, \ + { IOMAP_UNWRITTEN, "UNWRITTEN" }, \ + { IOMAP_INLINE, "INLINE" } + +#define IOMAP_FLAGS_STRINGS \ + { IOMAP_WRITE, "WRITE" }, \ + { IOMAP_ZERO, "ZERO" }, \ + { IOMAP_REPORT, "REPORT" }, \ + { IOMAP_FAULT, "FAULT" }, \ + { IOMAP_DIRECT, "DIRECT" }, \ + { IOMAP_NOWAIT, "NOWAIT" } + +#define IOMAP_F_FLAGS_STRINGS \ + { IOMAP_F_NEW, "NEW" }, \ + { IOMAP_F_DIRTY, "DIRTY" }, \ + { IOMAP_F_SHARED, "SHARED" }, \ + { IOMAP_F_MERGED, "MERGED" }, \ + { IOMAP_F_BUFFER_HEAD, "BH" }, \ + { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } + +DECLARE_EVENT_CLASS(iomap_class, + TP_PROTO(struct inode *inode, struct iomap *iomap), + TP_ARGS(inode, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(u64, addr) + __field(loff_t, offset) + __field(u64, length) + __field(u16, type) + __field(u16, flags) + __field(dev_t, bdev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->addr = iomap->addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->bdev = iomap->bdev ? 
iomap->bdev->bd_dev : 0; + ), + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld " + "length %llu type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + MAJOR(__entry->bdev), MINOR(__entry->bdev), + __entry->addr, + __entry->offset, + __entry->length, + __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) +) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(iomap_class, name, \ + TP_PROTO(struct inode *inode, struct iomap *iomap), \ + TP_ARGS(inode, iomap)) +DEFINE_IOMAP_EVENT(iomap_apply_dstmap); +DEFINE_IOMAP_EVENT(iomap_apply_srcmap); + +TRACE_EVENT(iomap_apply, + TP_PROTO(struct inode *inode, loff_t pos, loff_t length, + unsigned int flags, const void *ops, void *actor, + unsigned long caller), + TP_ARGS(inode, pos, length, flags, ops, actor, caller), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(loff_t, pos) + __field(loff_t, length) + __field(unsigned int, flags) + __field(const void *, ops) + __field(void *, actor) + __field(unsigned long, caller) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->length = length; + __entry->flags = flags; + __entry->ops = ops; + __entry->actor = actor; + __entry->caller = caller; + ), + TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) " + "ops %ps caller %pS actor %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->length, + __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), + __entry->flags, + __entry->ops, + (void *)__entry->caller, + __entry->actor) +); + #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index a1909066bde6..8fff6677a5da 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -110,7 +110,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ - nblocks = jbd2_space_needed(journal); + nblocks = journal->j_max_transaction_buffers; while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); mutex_lock_io(&journal->j_checkpoint_mutex); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 132fb92098c7..7f0b362b3842 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (jh->b_committed_data) { struct buffer_head *bh = jh2bh(jh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); jbd2_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } jbd2_journal_refile_buffer(journal, jh); } @@ -560,8 +560,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = - atomic_read(&commit_transaction->t_outstanding_credits); + stats.run.rs_blocks = commit_transaction->t_nr_buffers; stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= @@ -642,8 +641,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* * start_this_handle() uses t_outstanding_credits to determine - * the free space in the log, but this counter is changed - * by jbd2_journal_next_log_block() also. + * the free space in the log. 
*/ atomic_dec(&commit_transaction->t_outstanding_credits); @@ -727,7 +725,6 @@ start_journal_io: submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); - stats.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -814,6 +811,7 @@ start_journal_io: if (unlikely(!buffer_uptodate(bh))) err = -EIO; jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; /* * The list contains temporary buffer heads created by @@ -859,6 +857,7 @@ start_journal_io: BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); clear_buffer_jwrite(bh); jbd2_unfile_log_bh(bh); + stats.run.rs_blocks_logged++; __brelse(bh); /* One for getblk */ /* AKPM: bforget here */ } @@ -880,6 +879,7 @@ start_journal_io: } if (cbh) err = journal_wait_on_commit_record(journal, cbh); + stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); @@ -888,6 +888,9 @@ start_journal_io: if (err) jbd2_journal_abort(journal, err); + WARN_ON_ONCE( + atomic_read(&commit_transaction->t_outstanding_credits) < 0); + /* * Now disk caches for filesystem device are flushed so we are safe to * erase checkpointed transactions from the log by updating journal @@ -918,6 +921,7 @@ restart_loop: transaction_t *cp_transaction; struct buffer_head *bh; int try_to_free = 0; + bool drop_ref; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); @@ -927,7 +931,7 @@ restart_loop: * done with it. */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); /* @@ -1022,8 +1026,10 @@ restart_loop: try_to_free = 1; } JBUFFER_TRACE(jh, "refile or unfile buffer"); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + drop_ref = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); + if (drop_ref) + jbd2_journal_put_journal_head(jh); if (try_to_free) release_buffer_page(bh); /* Drops bh reference */ else diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1c58859aa592..5e408ee24a1a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); repeat: /* * If a new transaction has already done a buffer copy-out, then @@ -405,13 +405,13 @@ repeat: if (need_copy_out && !done_copy_out) { char *tmp; - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); if (!tmp) { brelse(new_bh); return -ENOMEM; } - jbd_lock_bh_state(bh_in); + spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); goto repeat; @@ -464,7 +464,7 @@ repeat: __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); spin_unlock(&journal->j_list_lock); set_buffer_shadow(bh_in); - jbd_unlock_bh_state(bh_in); + spin_unlock(&jh_in->b_state_lock); return do_escape | (done_copy_out << 1); } @@ -840,6 +840,7 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) return NULL; + atomic_dec(&transaction->t_outstanding_credits); lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); header = (journal_header_t *)bh->b_data; @@ -1098,6 +1099,16 @@ static void jbd2_stats_proc_exit(journal_t *journal) remove_proc_entry(journal->j_devname, proc_jbd2_stats); 
} +/* Minimum size of descriptor tag */ +static int jbd2_min_tag_size(void) +{ + /* + * Tag with 32-bit block numbers does not use last four bytes of the + * structure + */ + return sizeof(journal_block_tag_t) - 4; +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1156,7 +1167,8 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; - n = journal->j_blocksize / sizeof(journal_block_tag_t); + /* We need enough buffers to write out full descriptor block. */ + n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); @@ -1488,6 +1500,21 @@ void jbd2_journal_update_sb_errno(journal_t *journal) } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + /* * Read the superblock for a given journal, performing initial * validation of the format. @@ -1596,6 +1623,8 @@ static int journal_get_superblock(journal_t *journal) sizeof(sb->s_uuid)); } + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); set_buffer_verified(bh); return 0; @@ -1916,6 +1945,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); unlock_buffer(journal->j_sb_buffer); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); return 1; #undef COMPAT_FEATURE_ON @@ -1946,6 +1977,8 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, sb->s_feature_compat &= ~cpu_to_be32(compat); sb->s_feature_ro_compat &= ~cpu_to_be32(ro); sb->s_feature_incompat &= ~cpu_to_be32(incompat); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); } EXPORT_SYMBOL(jbd2_journal_clear_features); @@ -2410,6 +2443,8 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); } + if (ret) + spin_lock_init(&ret->b_state_lock); return ret; } @@ -2529,17 +2564,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh) J_ASSERT_BH(bh, buffer_jbd(bh)); J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); + + /* Unlink before dropping the lock */ + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); +} + +static void journal_release_journal_head(struct journal_head *jh, size_t b_size) +{ if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); - jbd2_free(jh->b_frozen_data, bh->b_size); + jbd2_free(jh->b_frozen_data, b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); - jbd2_free(jh->b_committed_data, bh->b_size); + jbd2_free(jh->b_committed_data, b_size); } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); journal_free_journal_head(jh); } @@ -2557,9 +2598,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) if (!jh->b_jcount) 
{ __journal_remove_journal_head(bh); jbd_unlock_bh_journal_head(bh); + journal_release_journal_head(jh, bh->b_size); __brelse(bh); - } else + } else { jbd_unlock_bh_journal_head(bh); + } } /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index f08073d7bbf5..fa608788b93d 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -371,6 +371,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, } #endif + if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { + if (!bh_in) + brelse(bh); + return -EIO; + } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ @@ -391,6 +396,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, __brelse(bh); } } + handle->h_revoke_credits--; jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index bee8498d7792..27b9f9dee434 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -63,6 +63,28 @@ void jbd2_journal_free_transaction(transaction_t *transaction) } /* + * Base amount of descriptor blocks we reserve for each transaction. + */ +static int jbd2_descriptor_blocks_per_trans(journal_t *journal) +{ + int tag_space = journal->j_blocksize - sizeof(journal_header_t); + int tags_per_block; + + /* Subtract UUID */ + tag_space -= 16; + if (jbd2_journal_has_csum_v2or3(journal)) + tag_space -= sizeof(struct jbd2_journal_block_tail); + /* Commit code leaves a slack space of 16 bytes at the end of block */ + tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); + /* + * Revoke descriptors are accounted separately so we need to reserve + * space for commit block and normal transaction descriptor blocks. + */ + return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers, + tags_per_block); +} + +/* * jbd2_get_transaction: obtain a new transaction_t object. * * Simply initialise a new transaction. Initialize it in @@ -88,7 +110,9 @@ static void jbd2_get_transaction(journal_t *journal, spin_lock_init(&transaction->t_handle_lock); atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, + jbd2_descriptor_blocks_per_trans(journal) + atomic_read(&journal->j_reserved_credits)); + atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -258,12 +282,13 @@ static int add_transaction_credits(journal_t *journal, int blocks, * *before* starting to dirty potentially checkpointed buffers * in the new transaction. 
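 *
 * (Worked example for jbd2_descriptor_blocks_per_trans() above,
 * with illustrative sizes: on a 4096-byte block, assuming a
 * 12-byte journal_header_t and 16-byte tags, tag_space is
 * 4096 - 12 - 16 = 4068; minus the 16-byte slack and divided by
 * the tag size that is 4052 / 16 = 253 tags per descriptor block,
 * so a transaction limited to 8192 buffers reserves
 * 1 + DIV_ROUND_UP(8192, 253) = 34 blocks up front.)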
*/ - if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { + if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); - if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) + if (jbd2_log_space_left(journal) < + journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); return 1; @@ -299,12 +324,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle, gfp_t gfp_mask) { transaction_t *transaction, *new_transaction = NULL; - int blocks = handle->h_buffer_credits; + int blocks = handle->h_total_credits; int rsv_blocks = 0; unsigned long ts = jiffies; if (handle->h_rsv_handle) - rsv_blocks = handle->h_rsv_handle->h_buffer_credits; + rsv_blocks = handle->h_rsv_handle->h_total_credits; /* * Limit the number of reserved credits to 1/2 of maximum transaction @@ -405,6 +430,7 @@ repeat: update_t_max_wait(transaction, ts); handle->h_transaction = transaction; handle->h_requested_credits = blocks; + handle->h_revoke_credits_requested = handle->h_revoke_credits; handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); @@ -431,15 +457,15 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd2_alloc_handle(GFP_NOFS); if (!handle) return NULL; - handle->h_buffer_credits = nblocks; + handle->h_total_credits = nblocks; handle->h_ref = 1; return handle; } handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, - gfp_t gfp_mask, unsigned int type, - unsigned int line_no) + int revoke_records, gfp_t gfp_mask, + unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -453,6 +479,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, return handle; } + nblocks += DIV_ROUND_UP(revoke_records, + journal->j_revoke_records_per_block); handle = new_handle(nblocks); if (!handle) return ERR_PTR(-ENOMEM); @@ -468,6 +496,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, rsv_handle->h_journal = journal; handle->h_rsv_handle = rsv_handle; } + handle->h_revoke_credits = revoke_records; err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { @@ -508,16 +537,21 @@ EXPORT_SYMBOL(jbd2__journal_start); */ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0); + return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); -void jbd2_journal_free_reserved(handle_t *handle) +static void __jbd2_journal_unreserve_handle(handle_t *handle) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); - sub_reserved_credits(journal, handle->h_buffer_credits); + sub_reserved_credits(journal, handle->h_total_credits); +} + +void jbd2_journal_free_reserved(handle_t *handle) +{ + __jbd2_journal_unreserve_handle(handle); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); @@ -571,7 +605,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, handle->h_line_no = line_no; trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, - line_no, handle->h_buffer_credits); + line_no, handle->h_total_credits); return 0; } EXPORT_SYMBOL(jbd2_journal_start_reserved); @@ -580,6 +614,7 @@ 
EXPORT_SYMBOL(jbd2_journal_start_reserved); * int jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. + * @revoke_records: number of revoke records to try to extend by. * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests @@ -596,7 +631,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved); * return code < 0 implies an error * return code > 0 implies normal transaction-full status. */ -int jbd2_journal_extend(handle_t *handle, int nblocks) +int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -618,6 +653,12 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto error_out; } + nblocks += DIV_ROUND_UP( + handle->h_revoke_credits_requested + revoke_records, + journal->j_revoke_records_per_block) - + DIV_ROUND_UP( + handle->h_revoke_credits_requested, + journal->j_revoke_records_per_block); spin_lock(&transaction->t_handle_lock); wanted = atomic_add_return(nblocks, &transaction->t_outstanding_credits); @@ -629,22 +670,16 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto unlock; } - if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) > - jbd2_log_space_left(journal)) { - jbd_debug(3, "denied handle %p %d blocks: " - "insufficient log space\n", handle, nblocks); - atomic_sub(nblocks, &transaction->t_outstanding_credits); - goto unlock; - } - trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, transaction->t_tid, handle->h_type, handle->h_line_no, - handle->h_buffer_credits, + handle->h_total_credits, nblocks); - handle->h_buffer_credits += nblocks; + handle->h_total_credits += nblocks; handle->h_requested_credits += nblocks; + handle->h_revoke_credits += revoke_records; + handle->h_revoke_credits_requested += revoke_records; result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); @@ -655,11 +690,55 @@ error_out: return result; } +static void stop_this_handle(handle_t *handle) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int revokes; + + J_ASSERT(journal_current_handle() == handle); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); + current->journal_info = NULL; + /* + * Subtract necessary revoke descriptor blocks from handle credits. We + * take care to account only for revoke descriptor blocks the + * transaction will really need as large sequences of transactions with + * small numbers of revokes are relatively common. + */ + revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits; + if (revokes) { + int t_revokes, revoke_descriptors; + int rr_per_blk = journal->j_revoke_records_per_block; + + WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk) + > handle->h_total_credits); + t_revokes = atomic_add_return(revokes, + &transaction->t_outstanding_revokes); + revoke_descriptors = + DIV_ROUND_UP(t_revokes, rr_per_blk) - + DIV_ROUND_UP(t_revokes - revokes, rr_per_blk); + handle->h_total_credits -= revoke_descriptors; + } + atomic_sub(handle->h_total_credits, + &transaction->t_outstanding_credits); + if (handle->h_rsv_handle) + __jbd2_journal_unreserve_handle(handle->h_rsv_handle); + if (atomic_dec_and_test(&transaction->t_updates)) + wake_up(&journal->j_wait_updates); + + rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. 
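+	 *
+	 * (Worked example for the revoke accounting above, with an
+	 * assumed j_revoke_records_per_block of 1024: a handle that
+	 * consumed 200 revoke records joins a transaction already
+	 * holding 900, so t_revokes becomes 1100 and
+	 * DIV_ROUND_UP(1100, 1024) - DIV_ROUND_UP(900, 1024) = 2 - 1
+	 * charges this handle exactly one descriptor block.)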
+ */ + memalloc_nofs_restore(handle->saved_alloc_context); +} /** * int jbd2_journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested + * @revoke_records: number of revoke record credits requested * @gfp_mask: memory allocation flags (for start_this_handle) * * Restart a handle for a multi-transaction filesystem @@ -672,56 +751,48 @@ error_out: * credits. We preserve reserved handle if there's any attached to the * passed in handle. */ -int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) +int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records, + gfp_t gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal; tid_t tid; - int need_to_start, ret; + int need_to_start; + int ret; /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) return 0; journal = transaction->t_journal; + tid = transaction->t_tid; /* * First unlink the handle from its current transaction, and start the * commit on that. */ - J_ASSERT(atomic_read(&transaction->t_updates) > 0); - J_ASSERT(journal_current_handle() == handle); - - read_lock(&journal->j_state_lock); - spin_lock(&transaction->t_handle_lock); - atomic_sub(handle->h_buffer_credits, - &transaction->t_outstanding_credits); - if (handle->h_rsv_handle) { - sub_reserved_credits(journal, - handle->h_rsv_handle->h_buffer_credits); - } - if (atomic_dec_and_test(&transaction->t_updates)) - wake_up(&journal->j_wait_updates); - tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); + jbd_debug(2, "restarting handle %p\n", handle); + stop_this_handle(handle); handle->h_transaction = NULL; - current->journal_info = NULL; - jbd_debug(2, "restarting handle %p\n", handle); + /* + * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can + * get rid of pointless j_state_lock traffic like this. + */ + read_lock(&journal->j_state_lock); need_to_start = !tid_geq(journal->j_commit_request, tid); read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); - - rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); - handle->h_buffer_credits = nblocks; - /* - * Restore the original nofs context because the journal restart - * is basically the same thing as journal stop and start. - * start_this_handle will start a new nofs context. - */ - memalloc_nofs_restore(handle->saved_alloc_context); + handle->h_total_credits = nblocks + + DIV_ROUND_UP(revoke_records, + journal->j_revoke_records_per_block); + handle->h_revoke_credits = revoke_records; ret = start_this_handle(journal, handle, gfp_mask); + trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev, + ret ? 
0 : handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_total_credits); return ret; } EXPORT_SYMBOL(jbd2__journal_restart); @@ -729,7 +800,7 @@ EXPORT_SYMBOL(jbd2__journal_restart); int jbd2_journal_restart(handle_t *handle, int nblocks) { - return jbd2__journal_restart(handle, nblocks, GFP_NOFS); + return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS); } EXPORT_SYMBOL(jbd2_journal_restart); @@ -879,7 +950,7 @@ repeat: start_lock = jiffies; lock_buffer(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); /* If it takes too long to lock the buffer, trace it */ time_lock = jbd2_time_diff(start_lock, jiffies); @@ -929,7 +1000,7 @@ repeat: error = -EROFS; if (is_handle_aborted(handle)) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto out; } error = 0; @@ -993,7 +1064,7 @@ repeat: */ if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); goto repeat; } @@ -1014,7 +1085,7 @@ repeat: JBUFFER_TRACE(jh, "generate frozen data"); if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS | __GFP_NOFAIL); goto repeat; @@ -1033,7 +1104,7 @@ attach_next: jh->b_next_transaction = transaction; done: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * If we are about to journal a buffer, then any revoke pending on it is @@ -1172,7 +1243,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * that case: the transaction must have deleted the buffer for it to be * reused here. */ - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1207,7 +1278,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); /* * akpm: I added this. ext3_alloc_branch can pick up new indirect @@ -1275,13 +1346,13 @@ repeat: committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS|__GFP_NOFAIL); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (!jh->b_committed_data) { /* Copy out the current buffer contents into the * preserved, committed copy. */ JBUFFER_TRACE(jh, "generate b_committed data"); if (!committed_data) { - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); goto repeat; } @@ -1289,7 +1360,7 @@ repeat: committed_data = NULL; memcpy(jh->b_committed_data, bh->b_data, bh->b_size); } - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: jbd2_journal_put_journal_head(jh); if (unlikely(committed_data)) @@ -1390,16 +1461,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction && jh->b_next_transaction != transaction) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. 
*/ if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) { - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) pr_err("JBD2: assertion failure: h_type=%u " @@ -1409,13 +1480,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_jlist); J_ASSERT_JH(jh, jh->b_transaction != transaction || jh->b_jlist == BJ_Metadata); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); } goto out; } journal = transaction->t_journal; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); if (jh->b_modified == 0) { /* @@ -1423,12 +1494,12 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * of the transaction. This needs to be done * once a transaction -bzzz */ - if (handle->h_buffer_credits <= 0) { + if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) { ret = -ENOSPC; goto out_unlock_bh; } jh->b_modified = 1; - handle->h_buffer_credits--; + handle->h_total_credits--; } /* @@ -1501,7 +1572,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); spin_unlock(&journal->j_list_lock); out_unlock_bh: - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); out: JBUFFER_TRACE(jh, "exit"); return ret; @@ -1539,18 +1610,20 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); - jbd_lock_bh_state(bh); + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { + __bforget(bh); + return 0; + } - if (!buffer_jbd(bh)) - goto not_jbd; - jh = bh2jh(bh); + spin_lock(&jh->b_state_lock); /* Critical error: attempting to delete a bitmap buffer, maybe? * Don't do any jbd operations, and return an error. */ if (!J_EXPECT_JH(jh, !jh->b_committed_data, "inconsistent data on disk")) { err = -EIO; - goto not_jbd; + goto drop; } /* keep track of whether or not this transaction modified us */ @@ -1598,10 +1671,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); - if (!buffer_jbd(bh)) { - spin_unlock(&journal->j_list_lock); - goto not_jbd; - } + jbd2_journal_put_journal_head(jh); } spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { @@ -1643,7 +1713,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!jh->b_cp_transaction) { JBUFFER_TRACE(jh, "belongs to none transaction"); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1653,7 +1723,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (!buffer_dirty(bh)) { __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - goto not_jbd; + goto drop; } /* @@ -1666,20 +1736,15 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } - - jbd_unlock_bh_state(bh); - __brelse(bh); drop: + __brelse(bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ - handle->h_buffer_credits++; + handle->h_total_credits++; } return err; - -not_jbd: - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; } /** @@ -1706,45 +1771,34 @@ int jbd2_journal_stop(handle_t *handle) tid_t tid; pid_t pid; + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + if 
(is_handle_aborted(handle)) + return -EIO; + return 0; + } if (!transaction) { /* - * Handle is already detached from the transaction so - * there is nothing to do other than decrease a refcount, - * or free the handle if refcount drops to zero + * Handle is already detached from the transaction so there is + * nothing to do other than free the handle. */ - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } else { - if (handle->h_rsv_handle) - jbd2_free_handle(handle->h_rsv_handle); - goto free_and_exit; - } + memalloc_nofs_restore(handle->saved_alloc_context); + goto free_and_exit; } journal = transaction->t_journal; - - J_ASSERT(journal_current_handle() == handle); + tid = transaction->t_tid; if (is_handle_aborted(handle)) err = -EIO; - else - J_ASSERT(atomic_read(&transaction->t_updates) > 0); - - if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, - handle->h_ref); - return err; - } jbd_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, - transaction->t_tid, - handle->h_type, handle->h_line_no, + tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, handle->h_sync, handle->h_requested_credits, (handle->h_requested_credits - - handle->h_buffer_credits)); + handle->h_total_credits)); /* * Implement synchronous transaction batching. If the handle @@ -1804,19 +1858,13 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; - current->journal_info = NULL; - atomic_sub(handle->h_buffer_credits, - &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the - * transaction is too old now. + * going! We also want to force a commit if the transaction is too + * old now. */ if (handle->h_sync || - (atomic_read(&transaction->t_outstanding_credits) > - journal->j_max_transaction_buffers) || time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write @@ -1825,7 +1873,7 @@ int jbd2_journal_stop(handle_t *handle) jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ - jbd2_log_start_commit(journal, transaction->t_tid); + jbd2_log_start_commit(journal, tid); /* * Special case: JBD2_SYNC synchronous updates require us @@ -1836,31 +1884,19 @@ int jbd2_journal_stop(handle_t *handle) } /* - * Once we drop t_updates, if it goes to zero the transaction - * could start committing on us and eventually disappear. So - * once we do this, we must not dereference transaction - * pointer again. + * Once stop_this_handle() drops t_updates, the transaction could start + * committing on us and eventually disappear. So we must not + * dereference transaction pointer again after calling + * stop_this_handle(). 
*/ - tid = transaction->t_tid; - if (atomic_dec_and_test(&transaction->t_updates)) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } - - rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); + stop_this_handle(handle); if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); - if (handle->h_rsv_handle) - jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: - /* - * Scope of the GFP_NOFS context is over here and so we can restore the - * original alloc context. - */ - memalloc_nofs_restore(handle->saved_alloc_context); + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); jbd2_free_handle(handle); return err; } @@ -1878,7 +1914,7 @@ free_and_exit: * * j_list_lock is held. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1902,7 +1938,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh) * * Called with j_list_lock held, and the journal may not be locked. * - * jbd_lock_bh_state(jh2bh(jh)) is held. + * jh->b_state_lock is held. */ static inline void @@ -1934,7 +1970,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) transaction_t *transaction; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); transaction = jh->b_transaction; if (transaction) assert_spin_locked(&transaction->t_journal->j_list_lock); @@ -1971,17 +2007,15 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) } /* - * Remove buffer from all transactions. + * Remove buffer from all transactions. The caller is responsible for dropping + * the jh reference that belonged to the transaction. * * Called with bh_state lock and j_list_lock - * - * jh and bh may be already freed when this function returns. */ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; - jbd2_journal_put_journal_head(jh); } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) @@ -1990,18 +2024,19 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) /* Get reference so that buffer cannot be freed before we unlock it */ get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); __brelse(bh); } /* * Called from jbd2_journal_try_to_free_buffers(). * - * Called under jbd_lock_bh_state(bh) + * Called under jh->b_state_lock */ static void __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) @@ -2088,10 +2123,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, if (!jh) continue; - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); __journal_try_to_free_buffer(journal, bh); + spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); - jbd_unlock_bh_state(bh); if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); @@ -2112,7 +2147,7 @@ busy: * * Called under j_list_lock. * - * Called under jbd_lock_bh_state(bh). + * Called under jh->b_state_lock. 
*/ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) { @@ -2133,6 +2168,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) } else { JBUFFER_TRACE(jh, "on running transaction"); __jbd2_journal_unfile_buffer(jh); + jbd2_journal_put_journal_head(jh); } return may_free; } @@ -2199,18 +2235,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * holding the page lock. --sct */ - if (!buffer_jbd(bh)) + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ write_lock(&journal->j_state_lock); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) - goto zap_buffer_no_jh; - /* * We cannot remove the buffer from checkpoint lists until the * transaction adding inode to orphan list (let's call it T) @@ -2289,10 +2322,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, * for commit and try again. */ if (partial_page) { - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return -EBUSY; } /* @@ -2304,10 +2337,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -2331,11 +2364,10 @@ zap_buffer: * here. */ jh->b_modified = 0; - jbd2_journal_put_journal_head(jh); -zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); + jbd2_journal_put_journal_head(jh); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2422,7 +2454,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, int was_dirty = 0; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); assert_spin_locked(&transaction->t_journal->j_list_lock); J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); @@ -2484,11 +2516,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, void jbd2_journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) { - jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&jh->b_state_lock); spin_lock(&transaction->t_journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, jlist); spin_unlock(&transaction->t_journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + spin_unlock(&jh->b_state_lock); } /* @@ -2498,23 +2530,25 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * buffer on that transaction's metadata list. * * Called under j_list_lock - * Called under jbd_lock_bh_state(jh2bh(jh)) + * Called under jh->b_state_lock * - * jh and bh may be already free when this function returns + * When this function returns true, there's no next transaction to refile to + * and the caller has to drop jh reference through + * jbd2_journal_put_journal_head(). 
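+ *
+ * Typical caller pattern (sketch; it mirrors
+ * jbd2_journal_refile_buffer() below):
+ *
+ *	spin_lock(&jh->b_state_lock);
+ *	drop = __jbd2_journal_refile_buffer(jh);
+ *	spin_unlock(&jh->b_state_lock);
+ *	if (drop)
+ *		jbd2_journal_put_journal_head(jh);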
*/ -void __jbd2_journal_refile_buffer(struct journal_head *jh) +bool __jbd2_journal_refile_buffer(struct journal_head *jh) { int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); - J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + lockdep_assert_held(&jh->b_state_lock); if (jh->b_transaction) assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); /* If the buffer is now unused, just drop it. */ if (jh->b_next_transaction == NULL) { __jbd2_journal_unfile_buffer(jh); - return; + return true; } /* @@ -2542,6 +2576,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) if (was_dirty) set_buffer_jbddirty(bh); + return false; } /* @@ -2552,16 +2587,15 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { - struct buffer_head *bh = jh2bh(jh); + bool drop; - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - jbd_lock_bh_state(bh); + spin_lock(&jh->b_state_lock); spin_lock(&journal->j_list_lock); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + drop = __jbd2_journal_refile_buffer(jh); + spin_unlock(&jh->b_state_lock); spin_unlock(&journal->j_list_lock); - __brelse(bh); + if (drop) + jbd2_journal_put_journal_head(jh); } /* diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 021a4a2190ee..b86c78d178c6 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -226,7 +226,7 @@ static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *r lastend = this->ofs + this->size; } else { dbg_fragtree2("lookup gave no frag\n"); - return -EINVAL; + lastend = 0; } /* See if we ran off the end of the fragtree */ diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6ebae6bbe6a5..9d96e6871e1a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -438,7 +438,7 @@ void kernfs_put_active(struct kernfs_node *kn) return; if (kernfs_lockdep(kn)) - rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; @@ -476,7 +476,7 @@ static void kernfs_drain(struct kernfs_node *kn) if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); } kernfs_drain_open_files(kn); @@ -508,10 +508,6 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; - /* - * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino - * depends on this to filter reused stale node - */ if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); @@ -536,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -621,8 +617,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; - u32 gen; - int cursor; + u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -635,23 +630,19 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); - cursor = idr_get_cursor(&root->ino_idr); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); - if (ret >= 0 && ret < cursor) - root->next_generation++; - gen = root->next_generation; + if 
(ret >= 0 && ret < root->last_id_lowbits) + root->id_highbits++; + id_highbits = root->id_highbits; + root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; - kn->id.ino = ret; - kn->id.generation = gen; - /* - * set ino first. This RELEASE is paired with atomic_inc_not_zero in - * kernfs_find_and_get_node_by_ino - */ - atomic_set_release(&kn->count, 1); + kn->id = (u64)id_highbits << 32 | ret; + + atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -680,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -705,50 +696,52 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, } /* - * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number + * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root - * @ino: inode number + * @id: the target node id + * + * @id's lower 32bits encode ino and upper gen. If the gen portion is + * zero, all generations are matched. * * RETURNS: * NULL on failure. Return a kernfs node with reference counter incremented */ -struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, - unsigned int ino) +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id) { struct kernfs_node *kn; + ino_t ino = kernfs_id_ino(id); + u32 gen = kernfs_id_gen(id); - rcu_read_lock(); - kn = idr_find(&root->ino_idr, ino); + spin_lock(&kernfs_idr_lock); + + kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) - goto out; + goto err_unlock; - /* - * Since kernfs_node is freed in RCU, it's possible an old node for ino - * is freed, but reused before RCU grace period. But a freed node (see - * kernfs_put) or an incompletedly initialized node (see - * __kernfs_new_node) should have 'count' 0. We can use this fact to - * filter out such node. - */ - if (!atomic_inc_not_zero(&kn->count)) { - kn = NULL; - goto out; + if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ + if (kernfs_ino(kn) != ino) + goto err_unlock; + } else { + /* 0 matches all generations */ + if (unlikely(gen && kernfs_gen(kn) != gen)) + goto err_unlock; } /* - * The node could be a new node or a reused node. If it's a new node, - * we are ok. If it's reused because of RCU (because of - * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' - * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, - * hence we can use 'ino' to filter stale node. + * ACTIVATED is protected with kernfs_mutex but it was clear when + * @kn was added to idr and we just wanna see it set. No need to + * grab kernfs_mutex. */ - if (kn->id.ino != ino) - goto out; - rcu_read_unlock(); + if (unlikely(!(kn->flags & KERNFS_ACTIVATED) || + !atomic_inc_not_zero(&kn->count))) + goto err_unlock; + spin_unlock(&kernfs_idr_lock); return kn; -out: - rcu_read_unlock(); - kernfs_put(kn); +err_unlock: + spin_unlock(&kernfs_idr_lock); return NULL; } @@ -962,7 +955,17 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); - root->next_generation = 1; + + /* + * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. + * High bits generation. The starting value for both ino and + * generation is 1.
Initialize upper 32bit allocation + * accordingly. + */ + if (sizeof(ino_t) >= sizeof(u64)) + root->id_highbits = 0; + else + root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, @@ -1678,7 +1681,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->id.ino; + ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e8c792b49616..34366db3620d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -892,7 +892,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. */ - inode = ilookup(info->sb, kn->id.ino); + inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; @@ -901,7 +901,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->id.ino); + p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, &name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f3eaa8869f42..eac277c63d42 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->id.generation; + inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->id.ino); + inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 02ce570a9a3c..2f3c51d55261 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -109,8 +109,6 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); -struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, - unsigned int ino); /* * file.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 6c12fac2c287..9dc7e7a64e10 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -53,63 +53,85 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; -/* - * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode - * number and generation - */ -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id) +static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, + struct inode *parent) { - struct kernfs_node *kn; + struct kernfs_node *kn = inode->i_private; - kn = kernfs_find_and_get_node_by_ino(root, id->ino); - if (!kn) - return NULL; - if (kn->id.generation != id->generation) { - kernfs_put(kn); - return NULL; + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; } - return kn; + + *max_len = 2; + *(u64 *)fh = kn->id; + return FILEID_KERNFS; } -static struct inode *kernfs_fh_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type, bool get_parent) { struct 
kernfs_super_info *info = kernfs_info(sb); - struct inode *inode; struct kernfs_node *kn; + struct inode *inode; + u64 id; - if (ino == 0) - return ERR_PTR(-ESTALE); + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_KERNFS: + id = *(u64 *)fid; + break; + case FILEID_INO32_GEN: + case FILEID_INO32_GEN_PARENT: + /* + * blk_log_action() exposes "LOW32,HIGH32" pair without + * type and userland can call us with generic fid + * constructed from them. Combine it back to ID. See + * blk_log_action(). + */ + id = ((u64)fid->i32.gen << 32) | fid->i32.ino; + break; + default: + return NULL; + } - kn = kernfs_find_and_get_node_by_ino(info->root, ino); + kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); + + if (get_parent) { + struct kernfs_node *parent; + + parent = kernfs_get_parent(kn); + kernfs_put(kn); + kn = parent; + if (!kn) + return ERR_PTR(-ESTALE); + } + inode = kernfs_get_inode(sb, kn); kernfs_put(kn); if (!inode) return ERR_PTR(-ESTALE); - if (generation && inode->i_generation != generation) { - /* we didn't find the right inode.. */ - iput(inode); - return ERR_PTR(-ESTALE); - } - return inode; + return d_obtain_alias(inode); } -static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - kernfs_fh_get_inode); + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } -static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - kernfs_fh_get_inode); + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) @@ -120,6 +142,7 @@ static struct dentry *kernfs_get_parent_dentry(struct dentry *child) } static const struct export_operations kernfs_export_ops = { + .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, @@ -200,7 +223,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, dput(dentry); return ERR_PTR(-EINVAL); } - dtmp = lookup_one_len_unlocked(kntmp->name, dentry, + dtmp = lookup_positive_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) @@ -363,18 +386,9 @@ void kernfs_kill_sb(struct super_block *sb) void __init kernfs_init(void) { - - /* - * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino - * can access the slab lock free. This could introduce stale nodes, - * please see how kernfs_find_and_get_node_by_ino filters out stale - * nodes. 
- */ kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), - 0, - SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, - NULL); + 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", diff --git a/fs/libfs.c b/fs/libfs.c index 540611b99b9a..1463b038ffc4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -473,8 +473,7 @@ EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes - * @available: See .write_end of address_space_operations - * @file: " + * @file: See .write_end of address_space_operations * @mapping: " * @pos: " * @len: " diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 7d46fafdbbe5..0afb6d59bad0 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -464,7 +464,8 @@ nlm_bind_host(struct nlm_host *host) .version = host->h_version, .authflavor = RPC_AUTH_UNIX, .flags = (RPC_CLNT_CREATE_NOPING | - RPC_CLNT_CREATE_AUTOBIND), + RPC_CLNT_CREATE_AUTOBIND | + RPC_CLNT_CREATE_REUSEPORT), .cred = host->h_cred, }; diff --git a/fs/namei.c b/fs/namei.c index 671c3c1a3425..d6c91d1e88cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -925,7 +925,7 @@ static inline int may_follow_link(struct nameidata *nd) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); - audit_log_link_denied("follow_link"); + audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } @@ -993,7 +993,7 @@ static int may_linkat(struct path *link) if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) return 0; - audit_log_link_denied("linkat"); + audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } @@ -1031,6 +1031,10 @@ static int may_create_in_sticky(struct dentry * const dir, (dir->d_inode->i_mode & 0020 && ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) || (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) { + const char *operation = S_ISFIFO(inode->i_mode) ? + "sticky_create_fifo" : + "sticky_create_regular"; + audit_log_path_denied(AUDIT_ANOM_CREAT, operation); return -EACCES; } return 0; @@ -1206,25 +1210,25 @@ static int follow_automount(struct path *path, struct nameidata *nd, * - Flagged as automount point * * This may only be called in refwalk mode. + * On success path->dentry is known positive. * * Serialization is taken care of in namespace.c */ static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ - unsigned managed; + unsigned flags; bool need_mntput = false; int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ - while (managed = READ_ONCE(path->dentry->d_flags), - managed &= DCACHE_MANAGED_DENTRY, - unlikely(managed != 0)) { + while (flags = smp_load_acquire(&path->dentry->d_flags), + unlikely(flags & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ - if (managed & DCACHE_MANAGE_TRANSIT) { + if (flags & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); @@ -1233,7 +1237,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) } /* Transit to a mounted filesystem. 
*/ - if (managed & DCACHE_MOUNTED) { + if (flags & DCACHE_MOUNTED) { struct vfsmount *mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); @@ -1252,7 +1256,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) } /* Handle an automount point */ - if (managed & DCACHE_NEED_AUTOMOUNT) { + if (flags & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; @@ -1265,10 +1269,12 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (need_mntput && path->mnt == mnt) mntput(path->mnt); - if (ret == -EISDIR || !ret) - ret = 1; if (need_mntput) nd->flags |= LOOKUP_JUMPED; + if (ret == -EISDIR || !ret) + ret = 1; + if (ret > 0 && unlikely(d_flags_negative(flags))) + ret = -ENOENT; if (unlikely(ret < 0)) path_put_conditional(path, nd); return ret; @@ -1617,10 +1623,6 @@ static int lookup_fast(struct nameidata *nd, dput(dentry); return status; } - if (unlikely(d_is_negative(dentry))) { - dput(dentry); - return -ENOENT; - } path->mnt = mnt; path->dentry = dentry; @@ -1807,11 +1809,6 @@ static int walk_component(struct nameidata *nd, int flags) if (unlikely(err < 0)) return err; - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } - seq = 0; /* we are already out of RCU mode */ inode = d_backing_inode(path.dentry); } @@ -2564,6 +2561,26 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +/* + * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) + * on negatives. Returns known positive or ERR_PTR(); that's what + * most of the users want. Note that pinned negative with unlocked parent + * _can_ become positive at any time, so callers of lookup_one_len_unlocked() + * need to be very careful; pinned positives have ->d_inode stable, so + * this one avoids such problems. + */ +struct dentry *lookup_positive_unlocked(const char *name, + struct dentry *base, int len) +{ + struct dentry *ret = lookup_one_len_unlocked(name, base, len); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_positive_unlocked); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { @@ -2658,7 +2675,7 @@ mountpoint_last(struct nameidata *nd) return PTR_ERR(path.dentry); } } - if (d_is_negative(path.dentry)) { + if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) { dput(path.dentry); return -ENOENT; } @@ -3352,11 +3369,6 @@ static int do_last(struct nameidata *nd, if (unlikely(error < 0)) return error; - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } - /* * create/update audit record if it already exists. */ diff --git a/fs/namespace.c b/fs/namespace.c index fe0e9e1410fe..2adfe7b166a3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2478,8 +2478,10 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * time64_to_tm(sb->s_time_max, 0, &tm); - pr_warn("Mounted %s file system at %s supports timestamps until %04ld (0x%llx)\n", - sb->s_type->name, mntpath, + pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n", + sb->s_type->name, + is_mounted(mnt) ? 
"remounted" : "mounted", + mntpath, tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); @@ -2764,14 +2766,11 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, if (IS_ERR(mnt)) return PTR_ERR(mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); - if (error < 0) { - mntput(mnt); - return error; - } - mnt_warn_timestamp_expiry(mountpoint, mnt); + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + if (error < 0) + mntput(mnt); return error; } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 8f34daf85f70..549350259840 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -72,8 +72,8 @@ struct cb_getattrres { uint32_t bitmap[2]; uint64_t size; uint64_t change_attr; - struct timespec ctime; - struct timespec mtime; + struct timespec64 ctime; + struct timespec64 mtime; }; struct cb_recallargs { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f39924ba050b..cd4c6bc81cae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -26,7 +26,6 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, struct cb_getattrargs *args = argp; struct cb_getattrres *res = resp; struct nfs_delegation *delegation; - struct nfs_inode *nfsi; struct inode *inode; res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); @@ -47,17 +46,16 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, -ntohl(res->status)); goto out; } - nfsi = NFS_I(inode); rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = timespec64_to_timespec(inode->i_ctime); - res->mtime = timespec64_to_timespec(inode->i_mtime); + res->ctime = inode->i_ctime; + res->mtime = inode->i_mtime; res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 73a5a5ea2976..03a20f5716c7 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -627,7 +627,7 @@ static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, u return 0; } -static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) +static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec64 *time) { __be32 *p; @@ -639,14 +639,14 @@ static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *ti return 0; } -static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) { if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) return 0; return encode_attr_time(xdr,time); } -static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) { if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) return 0; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 30838304a0bf..02110a30a49e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -312,6 +312,12 @@ again: /* Match nfsv4 minorversion */ if (clp->cl_minorversion != data->minorversion) continue; + + /* Match request for a 
dedicated DS */ + if (test_bit(NFS_CS_DS, &data->init_flags) != + test_bit(NFS_CS_DS, &clp->cl_flags)) + continue; + /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) /* Match all xprt_switch full socket addresses */ @@ -515,6 +521,10 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; + if (test_bit(NFS_CS_NOPING, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NOPING; + if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_REUSEPORT; if (!IS_ERR(clp->cl_rpcclient)) return 0; @@ -662,6 +672,7 @@ static int nfs_init_server(struct nfs_server *server, .timeparms = &timeparms, .cred = server->cred, .nconnect = data->nfs_server.nconnect, + .init_flags = (1UL << NFS_CS_REUSEPORT), }; struct nfs_client *clp; int error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 071b90a45933..fe57b2b5314a 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -53,6 +53,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation, return false; } +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (nfs4_is_valid_delegation(delegation, 0)) + return delegation; + return NULL; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { @@ -189,7 +199,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL) { spin_lock(&delegation->lock); - if (delegation->inode != NULL) { + if (nfs4_is_valid_delegation(delegation, 0)) { nfs4_stateid_copy(&delegation->stateid, stateid); delegation->type = type; delegation->pagemod_limit = pagemod_limit; @@ -219,7 +229,6 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * delegation->cred, &delegation->stateid, issync); - nfs_free_delegation(delegation); return res; } @@ -288,7 +297,10 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, return NULL; spin_lock(&delegation->lock); - set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + if (!delegation->inode) { + spin_unlock(&delegation->lock); + return NULL; + } list_del_rcu(&delegation->super_list); delegation->inode = NULL; rcu_assign_pointer(nfsi->delegation, NULL); @@ -315,10 +327,12 @@ nfs_inode_detach_delegation(struct inode *inode) struct nfs_server *server = NFS_SERVER(inode); struct nfs_delegation *delegation; - delegation = nfs_start_delegation_return(nfsi); - if (delegation == NULL) - return NULL; - return nfs_detach_delegation(nfsi, delegation, server); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) + delegation = nfs_detach_delegation(nfsi, delegation, server); + rcu_read_unlock(); + return delegation; } static void @@ -329,6 +343,7 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, delegation->stateid.seqid = update->stateid.seqid; smp_wmb(); delegation->type = update->type; + clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags); } } @@ -369,14 +384,18 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, spin_lock(&clp->cl_lock); old_delegation = rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); - if (old_delegation != NULL) { - /* Is this an update of the existing delegation? 
*/ - if (nfs4_stateid_match_other(&old_delegation->stateid, - &delegation->stateid)) { - nfs_update_inplace_delegation(old_delegation, - delegation); - goto out; - } + if (old_delegation == NULL) + goto add_new; + /* Is this an update of the existing delegation? */ + if (nfs4_stateid_match_other(&old_delegation->stateid, + &delegation->stateid)) { + spin_lock(&old_delegation->lock); + nfs_update_inplace_delegation(old_delegation, + delegation); + spin_unlock(&old_delegation->lock); + goto out; + } + if (!test_bit(NFS_DELEGATION_REVOKED, &old_delegation->flags)) { /* * Deal with broken servers that hand out two * delegations for the same file. @@ -395,11 +414,11 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (test_and_set_bit(NFS_DELEGATION_RETURNING, &old_delegation->flags)) goto out; - freeme = nfs_detach_delegation_locked(nfsi, - old_delegation, clp); - if (freeme == NULL) - goto out; } + freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp); + if (freeme == NULL) + goto out; +add_new: list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -414,8 +433,10 @@ out: spin_unlock(&clp->cl_lock); if (delegation != NULL) nfs_free_delegation(delegation); - if (freeme != NULL) + if (freeme != NULL) { nfs_do_return_delegation(inode, freeme, 0); + nfs_free_delegation(freeme); + } return status; } @@ -425,7 +446,6 @@ out: static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - struct nfs_inode *nfsi = NFS_I(inode); int err = 0; if (delegation == NULL) @@ -447,8 +467,6 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation nfs_abort_delegation_return(delegation, clp); goto out; } - if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) - goto out; err = nfs_do_return_delegation(inode, delegation, issync); out: @@ -459,8 +477,6 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; - if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - goto out; if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { @@ -472,7 +488,10 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) ret = true; spin_unlock(&delegation->lock); } -out: + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || + test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + ret = false; + return ret; } @@ -575,19 +594,23 @@ restart: } /** - * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * nfs_inode_evict_delegation - return delegation, don't reclaim opens * @inode: inode to process * * Does not protect against delegation reclaims, therefore really only safe - * to be called from nfs4_clear_inode(). + * to be called from nfs4_clear_inode(). Guaranteed to always free + * the delegation structure. 
*/ -void nfs_inode_return_delegation_noreclaim(struct inode *inode) +void nfs_inode_evict_delegation(struct inode *inode) { struct nfs_delegation *delegation; delegation = nfs_inode_detach_delegation(inode); - if (delegation != NULL) + if (delegation != NULL) { + set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); nfs_do_return_delegation(inode, delegation, 1); + nfs_free_delegation(delegation); + } } /** @@ -623,10 +646,18 @@ int nfs4_inode_return_delegation(struct inode *inode) */ int nfs4_inode_make_writeable(struct inode *inode) { - if (!nfs4_has_session(NFS_SERVER(inode)->nfs_client) || - !nfs4_check_delegation(inode, FMODE_WRITE)) - return nfs4_inode_return_delegation(inode); - return 0; + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (delegation == NULL || + (nfs4_has_session(NFS_SERVER(inode)->nfs_client) && + (delegation->type & FMODE_WRITE))) { + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + return nfs4_inode_return_delegation(inode); } static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, @@ -734,10 +765,9 @@ static void nfs_mark_delegation_revoked(struct nfs_server *server, { set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; - nfs_mark_return_delegation(server, delegation); } -static bool nfs_revoke_delegation(struct inode *inode, +static void nfs_revoke_delegation(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_delegation *delegation; @@ -751,29 +781,69 @@ static bool nfs_revoke_delegation(struct inode *inode, if (stateid == NULL) { nfs4_stateid_copy(&tmp, &delegation->stateid); stateid = &tmp; - } else if (!nfs4_stateid_match(stateid, &delegation->stateid)) - goto out; + } else { + if (!nfs4_stateid_match_other(stateid, &delegation->stateid)) + goto out; + spin_lock(&delegation->lock); + if (stateid->seqid) { + if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) { + spin_unlock(&delegation->lock); + goto out; + } + delegation->stateid.seqid = stateid->seqid; + } + spin_unlock(&delegation->lock); + } nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); ret = true; out: rcu_read_unlock(); if (ret) nfs_inode_find_state_and_recover(inode, stateid); - return ret; } void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid) { + nfs_revoke_delegation(inode, stateid); +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +void nfs_delegation_mark_returned(struct inode *inode, + const nfs4_stateid *stateid) +{ struct nfs_delegation *delegation; - if (!nfs_revoke_delegation(inode, stateid)) + if (!inode) return; - delegation = nfs_inode_detach_delegation(inode); - if (delegation) - nfs_free_delegation(delegation); + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (!delegation) + goto out_rcu_unlock; + + spin_lock(&delegation->lock); + if (!nfs4_stateid_match_other(stateid, &delegation->stateid)) + goto out_spin_unlock; + if (stateid->seqid) { + /* If delegation->stateid is newer, don't mark as returned */ + if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) + goto out_clear_returning; + if (delegation->stateid.seqid != stateid->seqid) + delegation->stateid.seqid = stateid->seqid; + } + + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + +out_clear_returning: + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); +out_spin_unlock: + spin_unlock(&delegation->lock); +out_rcu_unlock: + rcu_read_unlock(); + + 
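/*
 * The seqid checks above depend on wraparound-safe sequence comparison.
 * A minimal sketch of that rule, assuming nfs4_stateid_is_newer() follows
 * standard serial-number arithmetic on the 32-bit seqid counter; the
 * helper below is hypothetical and uses host order for simplicity:
 */
static inline bool seqid_is_newer(u32 a, u32 b)
{
	/* true when a is ahead of b modulo 2^32, even across a wrap */
	return (s32)(a - b) > 0;
}
/* e.g. seqid_is_newer(1, 0xffffffff) is true: a wrapped counter is newer. */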
nfs_inode_find_state_and_recover(inode, stateid); } -EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); /** * nfs_expire_unused_delegation_types @@ -830,7 +900,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL) goto out_enoent; if (stateid != NULL && @@ -856,6 +926,7 @@ nfs_delegation_find_inode_server(struct nfs_server *server, list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); if (delegation->inode != NULL && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { freeme = igrab(delegation->inode); if (freeme && nfs_sb_active(freeme->i_sb)) @@ -1130,7 +1201,8 @@ void nfs_inode_find_delegation_state_and_recover(struct inode *inode, rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation && - nfs4_stateid_match_other(&delegation->stateid, stateid)) { + nfs4_stateid_match_or_older(&delegation->stateid, stateid) && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation); found = true; } @@ -1179,9 +1251,11 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL && - nfs4_stateid_match_other(dst, &delegation->stateid)) { + nfs4_stateid_match_other(dst, &delegation->stateid) && + nfs4_stateid_is_newer(&delegation->stateid, dst) && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { dst->seqid = delegation->stateid.seqid; - return ret; + ret = true; } rcu_read_unlock(); out: diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 9eb87ae4c982..15d3484be028 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -43,7 +43,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); int nfs4_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); -void nfs_inode_return_delegation_noreclaim(struct inode *inode); +void nfs_inode_evict_delegation(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_server_return_all_delegations(struct nfs_server *); @@ -53,6 +53,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); @@ -68,6 +69,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred); bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t 
flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index deecb67638aa..3430d6891e89 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -105,6 +105,7 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label, NULL); if (ret) { dprintk("%s: getattr failed %d\n", __func__, ret); + trace_nfs_fh_to_dentry(sb, server_fh, fattr->fileid, ret); dentry = ERR_PTR(ret); goto out_free_label; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 95dc90570786..8eb731d9be3e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -649,7 +649,7 @@ out: out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - return -EBUSY; + return -ETXTBSY; } EXPORT_SYMBOL_GPL(nfs_file_write); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2a03bfeec10a..b0b4b9f303fd 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -504,15 +504,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = timespec_to_timespec64(fattr->atime); + inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -698,7 +698,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -709,14 +709,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = timespec_to_timespec64(fattr->atime); + inode->i_atime = fattr->atime; else if (attr->ia_valid & ATTR_ATIME_SET) inode->i_atime = attr->ia_atime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -725,14 +725,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + inode->i_mtime = fattr->mtime; else if (attr->ia_valid & ATTR_MTIME_SET) inode->i_mtime = attr->ia_mtime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | 
NFS_INO_INVALID_CTIME); @@ -1351,7 +1351,7 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct timespec ts; + struct timespec64 ts; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -1361,18 +1361,18 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); } /* If we have atomic WCC data, we may update some attributes */ - ts = timespec64_to_timespec(inode->i_ctime); + ts = inode->i_ctime; if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&ts, &fattr->pre_ctime)) { - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + && timespec64_equal(&ts, &fattr->pre_ctime)) { + inode->i_ctime = fattr->ctime; } - ts = timespec64_to_timespec(inode->i_mtime); + ts = inode->i_mtime; if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) - && timespec_equal(&ts, &fattr->pre_mtime)) { - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + && timespec64_equal(&ts, &fattr->pre_mtime)) { + inode->i_mtime = fattr->mtime; if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); } @@ -1398,7 +1398,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; - struct timespec ts; + struct timespec64 ts; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return 0; @@ -1425,12 +1425,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE; - ts = timespec64_to_timespec(inode->i_mtime); - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&ts, &fattr->mtime)) + ts = inode->i_mtime; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; - ts = timespec64_to_timespec(inode->i_ctime); - if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&ts, &fattr->ctime)) + ts = inode->i_ctime; + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1460,8 +1460,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_OTHER; - ts = timespec64_to_timespec(inode->i_atime); - if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&ts, &fattr->atime)) + ts = inode->i_atime; + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -1733,12 +1733,12 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - fattr->pre_ctime = timespec64_to_timespec(inode->i_ctime); + fattr->pre_ctime = inode->i_ctime; fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - fattr->pre_mtime = timespec64_to_timespec(inode->i_mtime); + fattr->pre_mtime = inode->i_mtime; fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -1899,7 +1899,7 @@ static 
int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + inode->i_mtime = fattr->mtime; } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_MTIME @@ -1908,7 +1908,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CTIME @@ -1946,7 +1946,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = timespec_to_timespec64(fattr->atime); + inode->i_atime = fattr->atime; else if (server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 447a3c17fa8e..24a65da58aa9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -713,7 +713,7 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) * 1024*1024*1024. */ static inline -u64 nfs_timespec_to_change_attr(const struct timespec *ts) +u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 9287eb666322..5e0e9d29f5c5 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -157,6 +157,9 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out; + if (nfs_mountpoint_expiry_timeout < 0) + goto out; + mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index cbc17a203248..d94c7abdf25a 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -209,9 +209,9 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) * unsigned int useconds; * }; */ -static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +static __be32 *xdr_encode_time(__be32 *p, const struct timespec64 *timep) { - *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32((u32)timep->tv_sec); if (timep->tv_nsec != 0) *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); else @@ -227,14 +227,14 @@ static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. 
*/ static __be32 *xdr_encode_current_server_time(__be32 *p, - const struct timespec *timep) + const struct timespec64 *timep) { *p++ = cpu_to_be32(timep->tv_sec); *p++ = cpu_to_be32(1000000); return p; } -static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +static __be32 *xdr_decode_time(__be32 *p, struct timespec64 *timep) { timep->tv_sec = be32_to_cpup(p++); timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; @@ -339,7 +339,6 @@ static __be32 *xdr_time_not_set(__be32 *p) static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr, struct user_namespace *userns) { - struct timespec ts; __be32 *p; p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); @@ -362,19 +361,15 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr, *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_ATIME_SET) { - ts = timespec64_to_timespec(attr->ia_atime); - p = xdr_encode_time(p, &ts); + p = xdr_encode_time(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { - ts = timespec64_to_timespec(attr->ia_atime); - p = xdr_encode_current_server_time(p, &ts); + p = xdr_encode_current_server_time(p, &attr->ia_atime); } else p = xdr_time_not_set(p); if (attr->ia_valid & ATTR_MTIME_SET) { - ts = timespec64_to_timespec(attr->ia_atime); - xdr_encode_time(p, &ts); + xdr_encode_time(p, &attr->ia_mtime); } else if (attr->ia_valid & ATTR_MTIME) { - ts = timespec64_to_timespec(attr->ia_mtime); - xdr_encode_current_server_time(p, &ts); + xdr_encode_current_server_time(p, &attr->ia_mtime); } else xdr_time_not_set(p); } diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 148ceb74d27c..223904bc40a7 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -106,7 +106,10 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.nconnect = mds_clp->cl_nconnect; if (mds_srv->flags & NFS_MOUNT_NORESVPORT) - set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + + __set_bit(NFS_CS_NOPING, &cl_init.init_flags); + __set_bit(NFS_CS_DS, &cl_init.init_flags); /* Use the MDS nfs_client cl_ipaddr. 
*/ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 602767850b36..927eb680f161 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -456,14 +456,14 @@ static void zero_nfs_fh3(struct nfs_fh *fh) * uint32 nseconds; * }; */ -static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec64 *timep) { - *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32((u32)timep->tv_sec); *p++ = cpu_to_be32(timep->tv_nsec); return p; } -static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec64 *timep) { timep->tv_sec = be32_to_cpup(p++); timep->tv_nsec = be32_to_cpup(p++); @@ -533,7 +533,6 @@ static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, struct user_namespace *userns) { - struct timespec ts; u32 nbytes; __be32 *p; @@ -583,10 +582,8 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, *p++ = xdr_zero; if (attr->ia_valid & ATTR_ATIME_SET) { - struct timespec ts; *p++ = xdr_two; - ts = timespec64_to_timespec(attr->ia_atime); - p = xdr_encode_nfstime3(p, &ts); + p = xdr_encode_nfstime3(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { *p++ = xdr_one; } else @@ -594,8 +591,7 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, if (attr->ia_valid & ATTR_MTIME_SET) { *p++ = xdr_two; - ts = timespec64_to_timespec(attr->ia_mtime); - xdr_encode_nfstime3(p, &ts); + xdr_encode_nfstime3(p, &attr->ia_mtime); } else if (attr->ia_valid & ATTR_MTIME) { *p = xdr_one; } else diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 901cca7542f9..c891af949886 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,8 +13,10 @@ #define PNFS_LAYOUTSTATS_MAXDEV (4) /* nfs4.2proc.c */ +#ifdef CONFIG_NFS_V4_2 int nfs42_proc_allocate(struct file *, loff_t, loff_t); -ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, + struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, @@ -23,5 +25,16 @@ int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t); int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, const struct nfs42_layout_error *errors, size_t n); +int nfs42_proc_copy_notify(struct file *, struct file *, + struct nfs42_copy_notify_res *); +static inline bool nfs42_files_from_same_server(struct file *in, + struct file *out) +{ + struct nfs_client *c_in = (NFS_SERVER(file_inode(in)))->nfs_client; + struct nfs_client *c_out = (NFS_SERVER(file_inode(out)))->nfs_client; + return nfs4_check_serverowner_major_id(c_in->cl_serverowner, + c_out->cl_serverowner); +} +#endif /* CONFIG_NFS_V4_2 */ #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5196bfa7894d..1fe83e0f663e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -3,6 +3,7 @@ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ #include <linux/fs.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/nfs.h> #include <linux/nfs3.h> @@ -15,10 +16,30 @@ #include "pnfs.h" #include "nfs4session.h" #include 
"internal.h" +#include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PROC static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std); +static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr) +{ + struct nfs_client *clp = (NFS_SERVER(file_inode(filep)))->nfs_client; + unsigned short port = 2049; + + rcu_read_lock(); + naddr->netid_len = scnprintf(naddr->netid, + sizeof(naddr->netid), "%s", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_NETID)); + naddr->addr_len = scnprintf(naddr->addr, + sizeof(naddr->addr), + "%s.%u.%u", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + port >> 8, port & 255); + rcu_read_unlock(); +} + static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, struct nfs_lock_context *lock, loff_t offset, loff_t len) { @@ -28,7 +49,7 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, - .falloc_bitmask = server->cache_consistency_bitmask, + .falloc_bitmask = nfs4_fattr_bitmap, }; struct nfs42_falloc_res res = { .falloc_server = server, @@ -132,22 +153,26 @@ out_unlock: } static int handle_async_copy(struct nfs42_copy_res *res, - struct nfs_server *server, + struct nfs_server *dst_server, + struct nfs_server *src_server, struct file *src, struct file *dst, - nfs4_stateid *src_stateid) + nfs4_stateid *src_stateid, + bool *restart) { struct nfs4_copy_state *copy, *tmp_copy; int status = NFS4_OK; bool found_pending = false; - struct nfs_open_context *ctx = nfs_file_open_context(dst); + struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); + struct nfs_open_context *src_ctx = nfs_file_open_context(src); copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); if (!copy) return -ENOMEM; - spin_lock(&server->nfs_client->cl_lock); - list_for_each_entry(tmp_copy, &server->nfs_client->pending_cb_stateids, + spin_lock(&dst_server->nfs_client->cl_lock); + list_for_each_entry(tmp_copy, + &dst_server->nfs_client->pending_cb_stateids, copies) { if (memcmp(&res->write_res.stateid, &tmp_copy->stateid, NFS4_STATEID_SIZE)) @@ -157,7 +182,7 @@ static int handle_async_copy(struct nfs42_copy_res *res, break; } if (found_pending) { - spin_unlock(&server->nfs_client->cl_lock); + spin_unlock(&dst_server->nfs_client->cl_lock); kfree(copy); copy = tmp_copy; goto out; @@ -165,19 +190,32 @@ static int handle_async_copy(struct nfs42_copy_res *res, memcpy(©->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE); init_completion(©->completion); - copy->parent_state = ctx->state; + copy->parent_dst_state = dst_ctx->state; + copy->parent_src_state = src_ctx->state; - list_add_tail(©->copies, &server->ss_copies); - spin_unlock(&server->nfs_client->cl_lock); + list_add_tail(©->copies, &dst_server->ss_copies); + spin_unlock(&dst_server->nfs_client->cl_lock); + + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_add_tail(©->src_copies, &src_server->ss_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } status = wait_for_completion_interruptible(©->completion); - spin_lock(&server->nfs_client->cl_lock); + spin_lock(&dst_server->nfs_client->cl_lock); list_del_init(©->copies); - spin_unlock(&server->nfs_client->cl_lock); + spin_unlock(&dst_server->nfs_client->cl_lock); + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_del_init(©->src_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } if (status == -ERESTARTSYS) { goto out_cancel; - 
} else if (copy->flags) { + } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) { status = -EAGAIN; + *restart = true; goto out_cancel; } out: @@ -185,12 +223,14 @@ out: memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf)); status = -copy->error; +out_free: kfree(copy); return status; out_cancel: nfs42_do_offload_cancel_async(dst, &copy->stateid); - kfree(copy); - return status; + if (!nfs42_files_from_same_server(src, dst)) + nfs42_do_offload_cancel_async(src, src_stateid); + goto out_free; } static int process_copy_commit(struct file *dst, loff_t pos_dst, @@ -222,7 +262,10 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct file *dst, struct nfs_lock_context *dst_lock, struct nfs42_copy_args *args, - struct nfs42_copy_res *res) + struct nfs42_copy_res *res, + struct nl4_server *nss, + nfs4_stateid *cnr_stateid, + bool *restart) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], @@ -230,17 +273,23 @@ static ssize_t _nfs42_proc_copy(struct file *src, .rpc_resp = res, }; struct inode *dst_inode = file_inode(dst); - struct nfs_server *server = NFS_SERVER(dst_inode); + struct inode *src_inode = file_inode(src); + struct nfs_server *dst_server = NFS_SERVER(dst_inode); + struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; size_t count = args->count; ssize_t status; - status = nfs4_set_rw_stateid(&args->src_stateid, src_lock->open_context, - src_lock, FMODE_READ); - if (status) - return status; - + if (nss) { + args->cp_src = nss; + nfs4_stateid_copy(&args->src_stateid, cnr_stateid); + } else { + status = nfs4_set_rw_stateid(&args->src_stateid, + src_lock->open_context, src_lock, FMODE_READ); + if (status) + return status; + } status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, pos_src, pos_src + (loff_t)count - 1); if (status) @@ -262,13 +311,15 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (!res->commit_res.verf) return -ENOMEM; } + set_bit(NFS_CLNT_SRC_SSC_COPY_STATE, + &src_lock->open_context->state->flags); set_bit(NFS_CLNT_DST_SSC_COPY_STATE, &dst_lock->open_context->state->flags); - status = nfs4_call_sync(server->client, server, &msg, + status = nfs4_call_sync(dst_server->client, dst_server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) - server->caps &= ~NFS_CAP_COPY; + dst_server->caps &= ~NFS_CAP_COPY; if (status) goto out; @@ -280,8 +331,8 @@ static ssize_t _nfs42_proc_copy(struct file *src, } if (!res->synchronous) { - status = handle_async_copy(res, server, src, dst, - &args->src_stateid); + status = handle_async_copy(res, dst_server, src_server, src, + dst, &args->src_stateid, restart); if (status) return status; } @@ -304,8 +355,9 @@ out: } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, - struct file *dst, loff_t pos_dst, - size_t count) + struct file *dst, loff_t pos_dst, size_t count, + struct nl4_server *nss, + nfs4_stateid *cnr_stateid, bool sync) { struct nfs_server *server = NFS_SERVER(file_inode(dst)); struct nfs_lock_context *src_lock; @@ -316,7 +368,7 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, .dst_fh = NFS_FH(file_inode(dst)), .dst_pos = pos_dst, .count = count, - .sync = false, + .sync = sync, }; struct nfs42_copy_res res; struct nfs4_exception src_exception = { @@ -328,6 +380,7 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, .stateid = &args.dst_stateid, }; ssize_t err, err2; + bool restart = false; src_lock = nfs_get_lock_context(nfs_file_open_context(src)); 
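/*
 * For reference while reading the copy setup here: nfs42_set_netaddr()
 * earlier in this file encodes the destination as an RFC 5665 universal
 * address, i.e. the presentation address with the port's high and low
 * bytes appended. A minimal sketch under that assumption; the helper
 * name below is hypothetical:
 */
static int nl4_format_uaddr(char *buf, size_t len, const char *ip, u16 port)
{
	/* e.g. ip "192.168.1.42", port 2049 (0x0801) -> "192.168.1.42.8.1" */
	return snprintf(buf, len, "%s.%u.%u", ip, port >> 8, port & 255);
}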
if (IS_ERR(src_lock)) @@ -347,21 +400,33 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, inode_lock(file_inode(dst)); err = _nfs42_proc_copy(src, src_lock, dst, dst_lock, - &args, &res); + &args, &res, + nss, cnr_stateid, &restart); inode_unlock(file_inode(dst)); if (err >= 0) break; - if (err == -ENOTSUPP) { + if (err == -ENOTSUPP && + nfs42_files_from_same_server(src, dst)) { err = -EOPNOTSUPP; break; } else if (err == -EAGAIN) { - dst_exception.retry = 1; - continue; + if (!restart) { + dst_exception.retry = 1; + continue; + } + break; } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && !args.sync) { args.sync = true; dst_exception.retry = 1; continue; + } else if ((err == -ESTALE || + err == -NFS4ERR_OFFLOAD_DENIED || + err == -ENOTSUPP) && + !nfs42_files_from_same_server(src, dst)) { + nfs42_do_offload_cancel_async(src, &args.src_stateid); + err = -EOPNOTSUPP; + break; } err2 = nfs4_handle_exception(server, err, &src_exception); @@ -459,6 +524,76 @@ static int nfs42_do_offload_cancel_async(struct file *dst, return status; } +static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, + struct nfs42_copy_notify_args *args, + struct nfs42_copy_notify_res *res) +{ + struct nfs_server *src_server = NFS_SERVER(file_inode(src)); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY_NOTIFY], + .rpc_argp = args, + .rpc_resp = res, + }; + int status; + struct nfs_open_context *ctx; + struct nfs_lock_context *l_ctx; + + ctx = get_nfs_open_context(nfs_file_open_context(src)); + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + return PTR_ERR(l_ctx); + + status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx, + FMODE_READ); + nfs_put_lock_context(l_ctx); + if (status) + return status; + + status = nfs4_call_sync(src_server->client, src_server, &msg, + &args->cna_seq_args, &res->cnr_seq_res, 0); + if (status == -ENOTSUPP) + src_server->caps &= ~NFS_CAP_COPY_NOTIFY; + + put_nfs_open_context(nfs_file_open_context(src)); + return status; +} + +int nfs42_proc_copy_notify(struct file *src, struct file *dst, + struct nfs42_copy_notify_res *res) +{ + struct nfs_server *src_server = NFS_SERVER(file_inode(src)); + struct nfs42_copy_notify_args *args; + struct nfs4_exception exception = { + .inode = file_inode(src), + }; + int status; + + if (!(src_server->caps & NFS_CAP_COPY_NOTIFY)) + return -EOPNOTSUPP; + + args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS); + if (args == NULL) + return -ENOMEM; + + args->cna_src_fh = NFS_FH(file_inode(src)), + args->cna_dst.nl4_type = NL4_NETADDR; + nfs42_set_netaddr(dst, &args->cna_dst.u.nl4_addr); + exception.stateid = &args->cna_src_stateid; + + do { + status = _nfs42_proc_copy_notify(src, dst, args, res); + if (status == -ENOTSUPP) { + status = -EOPNOTSUPP; + goto out; + } + status = nfs4_handle_exception(src_server, status, &exception); + } while (exception.retry); + +out: + kfree(args); + return status; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index aed865a84629..c03f3246d6c5 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -21,7 +21,10 @@ #define encode_copy_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ - 2 + 2 + 2 + 1 + 1 + 1) + 2 + 2 + 2 + 1 + 1 + 1 +\ + 1 + /* One cnr_source_server */\ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define decode_copy_maxsz (op_decode_hdr_maxsz + \ 
NFS42_WRITE_RES_SIZE + \ 1 /* cr_consecutive */ + \ @@ -29,6 +32,16 @@ #define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE)) #define decode_offload_cancel_maxsz (op_decode_hdr_maxsz) +#define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) +#define decode_copy_notify_maxsz (op_decode_hdr_maxsz + \ + 3 + /* cnr_lease_time */\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 1 + /* Support 1 cnr_source_server */\ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) @@ -99,6 +112,12 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) +#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_notify_maxsz) +#define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_notify_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -166,6 +185,26 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_nl4_server(struct xdr_stream *xdr, + const struct nl4_server *ns) +{ + encode_uint32(xdr, ns->nl4_type); + switch (ns->nl4_type) { + case NL4_NAME: + case NL4_URL: + encode_string(xdr, ns->u.nl4_str_sz, ns->u.nl4_str); + break; + case NL4_NETADDR: + encode_string(xdr, ns->u.nl4_addr.netid_len, + ns->u.nl4_addr.netid); + encode_string(xdr, ns->u.nl4_addr.addr_len, + ns->u.nl4_addr.addr); + break; + default: + WARN_ON_ONCE(1); + } +} + static void encode_copy(struct xdr_stream *xdr, const struct nfs42_copy_args *args, struct compound_hdr *hdr) @@ -180,7 +219,12 @@ static void encode_copy(struct xdr_stream *xdr, encode_uint32(xdr, 1); /* consecutive = true */ encode_uint32(xdr, args->sync); - encode_uint32(xdr, 0); /* src server list */ + if (args->cp_src == NULL) { /* intra-ssc */ + encode_uint32(xdr, 0); /* no src server list */ + return; + } + encode_uint32(xdr, 1); /* supporting 1 server */ + encode_nl4_server(xdr, args->cp_src); } static void encode_offload_cancel(struct xdr_stream *xdr, @@ -191,6 +235,15 @@ static void encode_offload_cancel(struct xdr_stream *xdr, encode_nfs4_stateid(xdr, &args->osa_stateid); } +static void encode_copy_notify(struct xdr_stream *xdr, + const struct nfs42_copy_notify_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY_NOTIFY, decode_copy_notify_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->cna_src_stateid); + encode_nl4_server(xdr, &args->cna_dst); +} + static void encode_deallocate(struct xdr_stream *xdr, const struct nfs42_falloc_args *args, struct compound_hdr *hdr) @@ -355,6 +408,25 @@ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req, } /* + * Encode COPY_NOTIFY request + */ +static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_copy_notify_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->cna_seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->cna_seq_args, &hdr); + encode_putfh(xdr, args->cna_src_fh, &hdr); + encode_copy_notify(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode DEALLOCATE request */ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, @@ -490,6 +562,58 @@ 
static int decode_write_response(struct xdr_stream *xdr, return decode_verifier(xdr, &res->verifier.verifier); } +static int decode_nl4_server(struct xdr_stream *xdr, struct nl4_server *ns) +{ + struct nfs42_netaddr *naddr; + uint32_t dummy; + char *dummy_str; + __be32 *p; + int status; + + /* nl_type */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + ns->nl4_type = be32_to_cpup(p); + switch (ns->nl4_type) { + case NL4_NAME: + case NL4_URL: + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(&ns->u.nl4_str, dummy_str, dummy); + ns->u.nl4_str_sz = dummy; + break; + case NL4_NETADDR: + naddr = &ns->u.nl4_addr; + + /* netid string */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > RPCBIND_MAXNETIDLEN)) + return -EIO; + naddr->netid_len = dummy; + memcpy(naddr->netid, dummy_str, naddr->netid_len); + + /* uaddr string */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > RPCBIND_MAXUADDRLEN)) + return -EIO; + naddr->addr_len = dummy; + memcpy(naddr->addr, dummy_str, naddr->addr_len); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return 0; +} + static int decode_copy_requirements(struct xdr_stream *xdr, struct nfs42_copy_res *res) { __be32 *p; @@ -529,6 +653,42 @@ static int decode_offload_cancel(struct xdr_stream *xdr, return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL); } +static int decode_copy_notify(struct xdr_stream *xdr, + struct nfs42_copy_notify_res *res) +{ + __be32 *p; + int status, count; + + status = decode_op_hdr(xdr, OP_COPY_NOTIFY); + if (status) + return status; + /* cnr_lease_time */ + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + return -EIO; + p = xdr_decode_hyper(p, &res->cnr_lease_time.seconds); + res->cnr_lease_time.nseconds = be32_to_cpup(p); + + status = decode_opaque_fixed(xdr, &res->cnr_stateid, NFS4_STATEID_SIZE); + if (unlikely(status)) + return -EIO; + + /* number of source addresses */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + count = be32_to_cpup(p); + if (count > 1) + pr_warn("NFS: %s: nsvr %d > Supported. 
Use first servers\n", + __func__, count); + + status = decode_nl4_server(xdr, &res->cnr_src); + if (unlikely(status)) + return -EIO; + return 0; +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); @@ -657,6 +817,32 @@ out: } /* + * Decode COPY_NOTIFY response + */ +static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_copy_notify_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->cnr_seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy_notify(xdr, res); + +out: + return status; +} + +/* * Decode DEALLOCATE request */ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 16b2e5cc3e94..a7a73b1d1fec 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -166,9 +166,9 @@ enum { NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */ -#ifdef CONFIG_NFS_V4_2 NFS_CLNT_DST_SSC_COPY_STATE, /* dst server open state on client*/ -#endif /* CONFIG_NFS_V4_2 */ + NFS_CLNT_SRC_SSC_COPY_STATE, /* src server open state on client*/ + NFS_SRV_SSC_COPY_STATE, /* ssc state on the dst server */ }; struct nfs4_state { @@ -311,6 +311,13 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, fmode_t fmode); +extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label, + struct inode *inode); +extern int update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *deleg_stateid, + fmode_t fmode); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); @@ -445,6 +452,8 @@ extern void nfs4_set_lease_period(struct nfs_client *clp, /* nfs4state.c */ +extern const nfs4_stateid current_stateid; + const struct cred *nfs4_get_clid_cred(struct nfs_client *clp); const struct cred *nfs4_get_machine_cred(struct nfs_client *clp); const struct cred *nfs4_get_renew_cred(struct nfs_client *clp); @@ -457,6 +466,8 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, const struct cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); extern void nfs41_notify_server(struct nfs_client *); +bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { @@ -572,6 +583,12 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return nfs4_stateid_match_other(dst, src) && + !(src->seqid && nfs4_stateid_is_newer(dst, src)); +} + static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) { u32 seqid = be32_to_cpu(s1->seqid); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index da6204025a2d..460d6251c405 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -629,7 +629,7 @@ out: /* * Returns true if 
the server major ids match */ -static bool +bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, struct nfs41_server_owner *o2) { @@ -879,14 +879,17 @@ static int nfs4_set_client(struct nfs_server *server, }; struct nfs_client *clp; - if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP) + if (minorversion == 0) + __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); + else if (proto == XPRT_TRANSPORT_TCP) cl_init.nconnect = nconnect; + if (server->flags & NFS_MOUNT_NORESVPORT) - set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); if (server->options & NFS_OPTION_MIGRATION) - set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) - set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); + __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); server->port = rpc_get_port(addr); /* Allocate or find a client reference we can use */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 339663d04bf8..620de905cba9 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,14 +133,55 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { + struct nfs42_copy_notify_res *cn_resp = NULL; + struct nl4_server *nss = NULL; + nfs4_stateid *cnrs = NULL; + ssize_t ret; + bool sync = false; + /* Only offload copy if superblock is the same */ - if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) + if (file_in->f_op != &nfs4_file_operations) return -EXDEV; if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) return -EOPNOTSUPP; - return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); + /* if the copy size if smaller than 2 RPC payloads, make it + * synchronous + */ + if (count <= 2 * NFS_SERVER(file_inode(file_in))->rsize) + sync = true; +retry: + if (!nfs42_files_from_same_server(file_in, file_out)) { + /* for inter copy, if copy size if smaller than 12 RPC + * payloads, fallback to traditional copy. There are + * 14 RPCs during an NFSv4.x mount between source/dest + * servers. + */ + if (sync || + count <= 14 * NFS_SERVER(file_inode(file_in))->rsize) + return -EOPNOTSUPP; + cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res), + GFP_NOFS); + if (unlikely(cn_resp == NULL)) + return -ENOMEM; + + ret = nfs42_proc_copy_notify(file_in, file_out, cn_resp); + if (ret) { + ret = -EOPNOTSUPP; + goto out; + } + nss = &cn_resp->cnr_src; + cnrs = &cn_resp->cnr_stateid; + } + ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count, + nss, cnrs, sync); +out: + if (!nfs42_files_from_same_server(file_in, file_out)) + kfree(cn_resp); + if (ret == -EAGAIN) + goto retry; + return ret; } static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, @@ -263,6 +304,102 @@ out_unlock: out: return ret < 0 ? 
ret : count; } + +static int read_name_gen = 1; +#define SSC_READ_NAME_BODY "ssc_read_%d" + +struct file * +nfs42_ssc_open(struct vfsmount *ss_mnt, struct nfs_fh *src_fh, + nfs4_stateid *stateid) +{ + struct nfs_fattr fattr; + struct file *filep, *res; + struct nfs_server *server; + struct inode *r_ino = NULL; + struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; + char *read_name = NULL; + int len, status = 0; + + server = NFS_SERVER(ss_mnt->mnt_root->d_inode); + + nfs_fattr_init(&fattr); + + status = nfs4_proc_getattr(server, src_fh, &fattr, NULL, NULL); + if (status < 0) { + res = ERR_PTR(status); + goto out; + } + + res = ERR_PTR(-ENOMEM); + len = strlen(SSC_READ_NAME_BODY) + 16; + read_name = kzalloc(len, GFP_NOFS); + if (read_name == NULL) + goto out; + snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); + + r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, &fattr, + NULL); + if (IS_ERR(r_ino)) { + res = ERR_CAST(r_ino); + goto out_free_name; + } + + filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, FMODE_READ, + r_ino->i_fop); + if (IS_ERR(filep)) { + res = ERR_CAST(filep); + goto out_free_name; + } + filep->f_mode |= FMODE_READ; + + ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode, + filep); + if (IS_ERR(ctx)) { + res = ERR_CAST(ctx); + goto out_filep; + } + + res = ERR_PTR(-EINVAL); + sp = nfs4_get_state_owner(server, ctx->cred, GFP_KERNEL); + if (sp == NULL) + goto out_ctx; + + ctx->state = nfs4_get_open_state(r_ino, sp); + if (ctx->state == NULL) + goto out_stateowner; + + set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags); + set_bit(NFS_OPEN_STATE, &ctx->state->flags); + memcpy(&ctx->state->open_stateid.other, &stateid->other, + NFS4_STATEID_OTHER_SIZE); + update_open_stateid(ctx->state, stateid, NULL, filep->f_mode); + + nfs_file_set_open_context(filep, ctx); + put_nfs_open_context(ctx); + + file_ra_state_init(&filep->f_ra, filep->f_mapping->host->i_mapping); + res = filep; +out_free_name: + kfree(read_name); +out: + return res; +out_stateowner: + nfs4_put_state_owner(sp); +out_ctx: + put_nfs_open_context(ctx); +out_filep: + fput(filep); + goto out_free_name; +} +EXPORT_SYMBOL_GPL(nfs42_ssc_open); +void nfs42_ssc_close(struct file *filep) +{ + struct nfs_open_context *ctx = nfs_file_open_context(filep); + + ctx->state->flags = 0; +} +EXPORT_SYMBOL_GPL(nfs42_ssc_close); #endif /* CONFIG_NFS_V4_2 */ const struct file_operations nfs4_file_operations = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ab8ca20fd579..76d37161409a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -91,7 +91,6 @@ struct nfs4_opendata; static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); -static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label, struct inode *inode); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode); static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -476,6 +475,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_PARTNER_NO_AUTH: if (inode != NULL && stateid != NULL) { nfs_inode_find_state_and_recover(inode, stateid); @@ -521,9 
+521,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DEADSESSION: case -NFS4ERR_SEQ_FALSE_RETRY: case -NFS4ERR_SEQ_MISORDERED: - dprintk("%s ERROR: %d Reset session\n", __func__, - errorcode); - nfs4_schedule_session_recovery(clp->cl_session, errorcode); + /* Handled in nfs41_sequence_process() */ goto wait_on_recovery; #endif /* defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: @@ -782,6 +780,7 @@ static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; + int status; int ret = 1; if (slot == NULL) @@ -793,8 +792,13 @@ static int nfs41_sequence_process(struct rpc_task *task, session = slot->table->session; trace_nfs4_sequence_done(session, res); + + status = res->sr_status; + if (task->tk_status == -NFS4ERR_DEADSESSION) + status = -NFS4ERR_DEADSESSION; + /* Check the SEQUENCE operation status */ - switch (res->sr_status) { + switch (status) { case 0: /* Mark this sequence number as having been acked */ nfs4_slot_sequence_acked(slot, slot->seq_nr); @@ -866,6 +870,10 @@ static int nfs41_sequence_process(struct rpc_task *task, */ slot->seq_nr = slot->seq_nr_highest_sent; goto out_retry; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto session_recover; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -876,8 +884,10 @@ out: out_noaction: return ret; session_recover: - nfs4_schedule_session_recovery(session, res->sr_status); - goto retry_nowait; + nfs4_schedule_session_recovery(session, status); + dprintk("%s ERROR: %d Reset session\n", __func__, status); + nfs41_sequence_free_slot(res); + goto out; retry_new_seq: ++slot->seq_nr; retry_nowait: @@ -1440,8 +1450,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, return 0; if ((delegation->type & fmode) != fmode) return 0; - if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - return 0; switch (claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_FH: @@ -1718,7 +1726,7 @@ static void nfs_state_clear_delegation(struct nfs4_state *state) write_sequnlock(&state->seqlock); } -static int update_open_stateid(struct nfs4_state *state, +int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *delegation, fmode_t fmode) @@ -1739,7 +1747,7 @@ static int update_open_stateid(struct nfs4_state *state, ret = 1; } - deleg_cur = rcu_dereference(nfsi->delegation); + deleg_cur = nfs4_get_valid_delegation(state->inode); if (deleg_cur == NULL) goto no_delegation; @@ -1751,7 +1759,7 @@ static int update_open_stateid(struct nfs4_state *state, if (delegation == NULL) delegation = &deleg_cur->stateid; - else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation)) + else if (!nfs4_stateid_match_other(&deleg_cur->stateid, delegation)) goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); @@ -1798,7 +1806,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo fmode &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL || (delegation->type & fmode) == fmode) { rcu_read_unlock(); return; @@ -1810,7 +1818,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) { struct nfs4_state *state = opendata->state; 
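
The delegation lookups in the surrounding hunks move from a raw rcu_dereference() of NFS_I(inode)->delegation to nfs4_get_valid_delegation(), which (as the dropped NFS_DELEGATION_RETURNING check in can_open_delegated() relies on) returns NULL once a delegation is being returned. A sketch of the resulting caller pattern (may_open_by_delegation() is illustrative, not kernel code):

	static bool may_open_by_delegation(struct inode *inode, fmode_t fmode,
					   enum open_claim_type4 claim)
	{
		struct nfs_delegation *delegation;
		bool ok;

		rcu_read_lock();
		/* NULL if no delegation is held or it is being returned,
		 * so callers no longer race against delegreturn. */
		delegation = nfs4_get_valid_delegation(inode);
		ok = can_open_delegated(delegation, fmode, claim);
		rcu_read_unlock();
		return ok;
	}
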
- struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; @@ -1827,7 +1834,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) } spin_unlock(&state->owner->so_lock); rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); + delegation = nfs4_get_valid_delegation(state->inode); if (!can_open_delegated(delegation, fmode, claim)) { rcu_read_unlock(); break; @@ -2191,7 +2198,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); return -EAGAIN; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: @@ -2371,7 +2377,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.open_flags, claim)) goto out_no_action; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + delegation = nfs4_get_valid_delegation(data->state->inode); if (can_open_delegated(delegation, data->o_arg.fmode, claim)) goto unlock_no_action; rcu_read_unlock(); @@ -4065,7 +4071,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } -static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, +int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode) { @@ -5101,12 +5107,12 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid, const struct nfs_lock_context *l_ctx, fmode_t fmode) { - nfs4_stateid current_stateid; + nfs4_stateid _current_stateid; /* If the current stateid represents a lost lock, then exit */ - if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO) + if (nfs4_set_rw_stateid(&_current_stateid, ctx, l_ctx, fmode) == -EIO) return true; - return nfs4_stateid_match(stateid, &current_stateid); + return nfs4_stateid_match(stateid, &_current_stateid); } static bool nfs4_error_stateid_expired(int err) @@ -6199,10 +6205,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) task->tk_status = 0; break; case -NFS4ERR_OLD_STATEID: - if (nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) - goto out_restart; - task->tk_status = 0; - break; + if (!nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) + nfs4_stateid_seqid_inc(&data->stateid); + if (data->args.bitmask) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } + goto out_restart; case -NFS4ERR_ACCESS: if (data->args.bitmask) { data->args.bitmask = NULL; @@ -6217,6 +6226,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) if (exception.retry) goto out_restart; } + nfs_delegation_mark_returned(data->inode, data->args.stateid); data->rpc_status = task->tk_status; return; out_restart: @@ -6246,8 +6256,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) + if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) { + nfs4_sequence_done(task, &d_data->res.seq_res); return; + } lo = d_data->args.lr_args ? 
d_data->args.lr_args->layout : NULL; if (lo && !pnfs_layout_is_valid(lo)) { @@ -7823,6 +7835,15 @@ nfs41_same_server_scope(struct nfs41_server_scope *a, static void nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) { + struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp; + struct nfs_client *clp = args->client; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_session_recovery(clp->cl_session, + task->tk_status); + } } static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = { @@ -8870,8 +8891,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf case -NFS4ERR_BADSESSION: case -NFS4ERR_DEADSESSION: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - nfs4_schedule_session_recovery(clp->cl_session, - task->tk_status); break; default: nfs4_schedule_lease_recovery(clp); @@ -9900,6 +9919,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_ALLOCATE | NFS_CAP_COPY | NFS_CAP_OFFLOAD_CANCEL + | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0c6d53dc3672..34552329233d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -60,6 +60,7 @@ #include "nfs4session.h" #include "pnfs.h" #include "netns.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_STATE @@ -1407,7 +1408,7 @@ nfs_state_find_lock_state_by_stateid(struct nfs4_state *state, list_for_each_entry(pos, &state->lock_states, ls_locks) { if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags)) continue; - if (nfs4_stateid_match_other(&pos->ls_stateid, stateid)) + if (nfs4_stateid_match_or_older(&pos->ls_stateid, stateid)) return pos; } return NULL; @@ -1441,12 +1442,13 @@ void nfs_inode_find_state_and_recover(struct inode *inode, state = ctx->state; if (state == NULL) continue; - if (nfs4_stateid_match_other(&state->stateid, stateid) && + if (nfs4_stateid_match_or_older(&state->stateid, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) { found = true; continue; } - if (nfs4_stateid_match_other(&state->open_stateid, stateid) && + if (test_bit(NFS_OPEN_STATE, &state->flags) && + nfs4_stateid_match_or_older(&state->open_stateid, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) { found = true; continue; @@ -1556,16 +1558,32 @@ static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state { struct nfs4_copy_state *copy; - if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) + if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) && + !test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags)) return; spin_lock(&sp->so_server->nfs_client->cl_lock); list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { - if (!nfs4_stateid_match_other(&state->stateid, &copy->parent_state->stateid)) - continue; + if ((test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) && + !nfs4_stateid_match_other(&state->stateid, + &copy->parent_dst_state->stateid))) + continue; copy->flags = 1; - complete(&copy->completion); - break; + if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE, + &state->flags)) { + clear_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags); + complete(&copy->completion); + } + } + list_for_each_entry(copy, &sp->so_server->ss_copies, src_copies) { + if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) && + !nfs4_stateid_match_other(&state->stateid, + &copy->parent_src_state->stateid))) + continue; + copy->flags = 1; + if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE, 
&state->flags)) + complete(©->completion); } spin_unlock(&sp->so_server->nfs_client->cl_lock); } @@ -1593,6 +1611,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { + trace_nfs4_state_lock_reclaim(state, lock); if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); } @@ -1609,6 +1628,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs struct nfs4_state *state; unsigned int loop = 0; int status = 0; +#ifdef CONFIG_NFS_V4_2 + bool found_ssc_copy_state = false; +#endif /* CONFIG_NFS_V4_2 */ /* Note: we rely on the sp->so_states list being ordered * so that we always reclaim open(O_RDWR) and/or open(O_WRITE) @@ -1628,6 +1650,13 @@ restart: continue; if (state->state == 0) continue; +#ifdef CONFIG_NFS_V4_2 + if (test_bit(NFS_SRV_SSC_COPY_STATE, &state->flags)) { + nfs4_state_mark_recovery_failed(state, -EIO); + found_ssc_copy_state = true; + continue; + } +#endif /* CONFIG_NFS_V4_2 */ refcount_inc(&state->count); spin_unlock(&sp->so_lock); status = __nfs4_reclaim_open_state(sp, state, ops); @@ -1682,6 +1711,10 @@ restart: } raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); +#ifdef CONFIG_NFS_V4_2 + if (found_ssc_copy_state) + return -EIO; +#endif /* CONFIG_NFS_V4_2 */ return 0; out_err: nfs4_put_open_state(state); @@ -2508,6 +2541,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Ensure exclusive access to NFSv4 state */ do { + trace_nfs4_state_mgr(clp); clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { section = "purge state"; @@ -2621,6 +2655,7 @@ static void nfs4_state_manager(struct nfs_client *clp) out_error: if (strlen(section)) section_sep = ": "; + trace_nfs4_state_mgr_failed(clp, section, status); pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 04c57066a11a..2c9cbade561a 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -92,8 +92,8 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - /* If we are holding a delegation, return it! 
*/ - nfs_inode_return_delegation_noreclaim(inode); + /* If we are holding a delegation, return and free it */ + nfs_inode_evict_delegation(inode); /* Note that above delegreturn would trigger pnfs return-on-close */ pnfs_return_layout(inode); pnfs_destroy_layout(NFS_I(inode)); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index b2f395fa7350..e60b6fbd5ada 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -562,6 +562,99 @@ TRACE_EVENT(nfs4_setup_sequence, ) ); +TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_CHECK_LEASE); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_EXPIRED); +TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_REBOOT); +TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_NOGRACE); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN); +TRACE_DEFINE_ENUM(NFS4CLNT_SESSION_RESET); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_CONFIRM); +TRACE_DEFINE_ENUM(NFS4CLNT_SERVER_SCOPE_MISMATCH); +TRACE_DEFINE_ENUM(NFS4CLNT_PURGE_STATE); +TRACE_DEFINE_ENUM(NFS4CLNT_BIND_CONN_TO_SESSION); +TRACE_DEFINE_ENUM(NFS4CLNT_MOVED); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED); +TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING); + +#define show_nfs4_clp_state(state) \ + __print_flags(state, "|", \ + { NFS4CLNT_MANAGER_RUNNING, "MANAGER_RUNNING" }, \ + { NFS4CLNT_CHECK_LEASE, "CHECK_LEASE" }, \ + { NFS4CLNT_LEASE_EXPIRED, "LEASE_EXPIRED" }, \ + { NFS4CLNT_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \ + { NFS4CLNT_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \ + { NFS4CLNT_DELEGRETURN, "DELEGRETURN" }, \ + { NFS4CLNT_SESSION_RESET, "SESSION_RESET" }, \ + { NFS4CLNT_LEASE_CONFIRM, "LEASE_CONFIRM" }, \ + { NFS4CLNT_SERVER_SCOPE_MISMATCH, \ + "SERVER_SCOPE_MISMATCH" }, \ + { NFS4CLNT_PURGE_STATE, "PURGE_STATE" }, \ + { NFS4CLNT_BIND_CONN_TO_SESSION, \ + "BIND_CONN_TO_SESSION" }, \ + { NFS4CLNT_MOVED, "MOVED" }, \ + { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \ + { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \ + { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \ + { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" }) + +TRACE_EVENT(nfs4_state_mgr, + TP_PROTO( + const struct nfs_client *clp + ), + + TP_ARGS(clp), + + TP_STRUCT__entry( + __field(unsigned long, state) + __string(hostname, clp->cl_hostname) + ), + + TP_fast_assign( + __entry->state = clp->cl_state; + __assign_str(hostname, clp->cl_hostname) + ), + + TP_printk( + "hostname=%s clp state=%s", __get_str(hostname), + show_nfs4_clp_state(__entry->state) + ) +) + +TRACE_EVENT(nfs4_state_mgr_failed, + TP_PROTO( + const struct nfs_client *clp, + const char *section, + int status + ), + + TP_ARGS(clp, section, status), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(unsigned long, state) + __string(hostname, clp->cl_hostname) + __string(section, section) + ), + + TP_fast_assign( + __entry->error = status; + __entry->state = clp->cl_state; + __assign_str(hostname, clp->cl_hostname); + __assign_str(section, section); + ), + + TP_printk( + "hostname=%s clp state=%s error=%ld (%s) section=%s", + __get_str(hostname), + show_nfs4_clp_state(__entry->state), -__entry->error, + show_nfsv4_errors(__entry->error), __get_str(section) + + ) +) + TRACE_EVENT(nfs4_xdr_status, TP_PROTO( const struct xdr_stream *xdr, @@ -929,6 +1022,88 @@ TRACE_EVENT(nfs4_set_lock, ) ); +TRACE_DEFINE_ENUM(LK_STATE_IN_USE); +TRACE_DEFINE_ENUM(NFS_DELEGATED_STATE); +TRACE_DEFINE_ENUM(NFS_OPEN_STATE); +TRACE_DEFINE_ENUM(NFS_O_RDONLY_STATE); +TRACE_DEFINE_ENUM(NFS_O_WRONLY_STATE); +TRACE_DEFINE_ENUM(NFS_O_RDWR_STATE); 
+TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_REBOOT); +TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_NOGRACE); +TRACE_DEFINE_ENUM(NFS_STATE_POSIX_LOCKS); +TRACE_DEFINE_ENUM(NFS_STATE_RECOVERY_FAILED); +TRACE_DEFINE_ENUM(NFS_STATE_MAY_NOTIFY_LOCK); +TRACE_DEFINE_ENUM(NFS_STATE_CHANGE_WAIT); +TRACE_DEFINE_ENUM(NFS_CLNT_DST_SSC_COPY_STATE); +TRACE_DEFINE_ENUM(NFS_CLNT_SRC_SSC_COPY_STATE); +TRACE_DEFINE_ENUM(NFS_SRV_SSC_COPY_STATE); + +#define show_nfs4_state_flags(flags) \ + __print_flags(flags, "|", \ + { LK_STATE_IN_USE, "IN_USE" }, \ + { NFS_DELEGATED_STATE, "DELEGATED" }, \ + { NFS_OPEN_STATE, "OPEN" }, \ + { NFS_O_RDONLY_STATE, "O_RDONLY" }, \ + { NFS_O_WRONLY_STATE, "O_WRONLY" }, \ + { NFS_O_RDWR_STATE, "O_RDWR" }, \ + { NFS_STATE_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \ + { NFS_STATE_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \ + { NFS_STATE_POSIX_LOCKS, "POSIX_LOCKS" }, \ + { NFS_STATE_RECOVERY_FAILED, "RECOVERY_FAILED" }, \ + { NFS_STATE_MAY_NOTIFY_LOCK, "MAY_NOTIFY_LOCK" }, \ + { NFS_STATE_CHANGE_WAIT, "CHANGE_WAIT" }, \ + { NFS_CLNT_DST_SSC_COPY_STATE, "CLNT_DST_SSC_COPY" }, \ + { NFS_CLNT_SRC_SSC_COPY_STATE, "CLNT_SRC_SSC_COPY" }, \ + { NFS_SRV_SSC_COPY_STATE, "SRV_SSC_COPY" }) + +#define show_nfs4_lock_flags(flags) \ + __print_flags(flags, "|", \ + { BIT(NFS_LOCK_INITIALIZED), "INITIALIZED" }, \ + { BIT(NFS_LOCK_LOST), "LOST" }) + +TRACE_EVENT(nfs4_state_lock_reclaim, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs4_lock_state *lock + ), + + TP_ARGS(state, lock), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned long, state_flags) + __field(unsigned long, lock_flags) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->state_flags = state->flags; + __entry->lock_flags = lock->ls_flags; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x state_flags=%s lock_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + show_nfs4_state_flags(__entry->state_flags), + show_nfs4_lock_flags(__entry->lock_flags) + ) +) + DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_PROTO( const struct inode *inode, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ab07db0f07cd..936c57779ff4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1059,7 +1059,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve } static __be32 * -xdr_encode_nfstime4(__be32 *p, const struct timespec *t) +xdr_encode_nfstime4(__be32 *p, const struct timespec64 *t) { p = xdr_encode_hyper(p, (__s64)t->tv_sec); *p++ = cpu_to_be32(t->tv_nsec); @@ -1072,7 +1072,6 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server, const uint32_t attrmask[]) { - struct timespec ts; char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; int owner_namelen = 0; @@ -1161,16 +1160,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - ts = timespec64_to_timespec(iap->ia_atime); 
- p = xdr_encode_nfstime4(p, &ts); + p = xdr_encode_nfstime4(p, &iap->ia_atime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - ts = timespec64_to_timespec(iap->ia_mtime); - p = xdr_encode_nfstime4(p, &ts); + p = xdr_encode_nfstime4(p, &iap->ia_mtime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } @@ -4065,17 +4062,17 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint } static __be32 * -xdr_decode_nfstime4(__be32 *p, struct timespec *t) +xdr_decode_nfstime4(__be32 *p, struct timespec64 *t) { __u64 sec; p = xdr_decode_hyper(p, &sec); - t->tv_sec = (time_t)sec; + t->tv_sec = sec; t->tv_nsec = be32_to_cpup(p++); return p; } -static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) +static int decode_attr_time(struct xdr_stream *xdr, struct timespec64 *time) { __be32 *p; @@ -4086,7 +4083,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) return 0; } -static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4104,7 +4101,7 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return status; } -static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4123,7 +4120,7 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s } static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, - struct timespec *time) + struct timespec64 *time) { int status = 0; @@ -4186,7 +4183,7 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, return status; } -static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -7581,6 +7578,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(CLONE, enc_clone, dec_clone), PROC42(COPY, enc_copy, dec_copy), PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), + PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify), PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), }; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 361cc10d6f95..f64a33d2a1d1 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1065,6 +1065,39 @@ TRACE_EVENT(nfs_commit_done, ) ); +TRACE_EVENT(nfs_fh_to_dentry, + TP_PROTO( + const struct super_block *sb, + const struct nfs_fh *fh, + u64 fileid, + int error + ), + + TP_ARGS(sb, fh, fileid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = sb->s_dev; + __entry->fileid = fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x ", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + TRACE_DEFINE_ENUM(NFS_OK); TRACE_DEFINE_ENUM(NFSERR_PERM); TRACE_DEFINE_ENUM(NFSERR_NOENT); 
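
The nfs4xdr.c conversion above matters because the old struct timespec carries a 32-bit time_t on 32-bit architectures, so the timespec64_to_timespec() round-trip truncated post-2038 timestamps before encoding. The nfstime4 wire format is a 64-bit seconds count followed by a 32-bit nanoseconds field, which struct timespec64 maps onto directly; restating the patched encode helper for reference:

	/* nfstime4: int64 seconds then uint32 nanoseconds; encoding
	 * timespec64 directly avoids truncation on 32-bit builds. */
	static __be32 *
	xdr_encode_nfstime4(__be32 *p, const struct timespec64 *t)
	{
		p = xdr_encode_hyper(p, (__s64)t->tv_sec);
		*p++ = cpu_to_be32(t->tv_nsec);
		return p;
	}

diff --git 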
a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bb80034a7661..cec3070ab577 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2160,8 +2160,6 @@ out_unlock: return NULL; } -extern const nfs4_stateid current_stateid; - static void _lgopen_prepare_attached(struct nfs4_opendata *data, struct nfs_open_context *ctx) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a84df7d63403..8d8d04bb9d64 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1592,7 +1592,7 @@ static int nfs_parse_mount_options(char *raw, dfprintk(MOUNT, "NFS: invalid " "lookupcache argument\n"); return 0; - }; + } break; case Opt_fscache_uniq: if (nfs_get_option_str(args, &mnt->fscache_uniq)) @@ -1625,7 +1625,7 @@ static int nfs_parse_mount_options(char *raw, dfprintk(MOUNT, "NFS: invalid " "local_lock argument\n"); return 0; - }; + } break; /* @@ -2585,7 +2585,7 @@ static void nfs_get_cache_cookie(struct super_block *sb, if (mnt_s->fscache_key) { uniq = mnt_s->fscache_key->key.uniquifier; ulen = mnt_s->fscache_key->key.uniq_len; - }; + } } else return; diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 4f3390b20239..c489496b5659 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -121,8 +121,7 @@ static void nfs_netns_client_release(struct kobject *kobj) struct nfs_netns_client, kobject); - if (c->identifier) - kfree(c->identifier); + kfree(c->identifier); kfree(c); } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 10cefb0c07c7..f2f81561ebb6 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -73,7 +73,8 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS - select CRYPTO + select CRYPTO_MD5 + select CRYPTO_SHA256 select GRACE_PERIOD help This option enables support in your system's NFS server for diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ef55e9b1cd4e..32a9bf22ac08 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -685,8 +685,6 @@ nfsd_file_cache_purge(struct net *net) void nfsd_file_cache_shutdown(void) { - LIST_HEAD(dispose); - set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); lease_unregister_notifier(&nfsd_file_lease_notifier); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 86e5658651f1..195ab7a0fc89 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -863,13 +863,11 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, } else dchild = dget(dparent); } else - dchild = lookup_one_len_unlocked(name, dparent, namlen); + dchild = lookup_positive_unlocked(name, dparent, namlen); if (IS_ERR(dchild)) return rv; if (d_mountpoint(dchild)) goto out; - if (d_really_is_negative(dchild)) - goto out; if (dchild->d_inode->i_ino != ino) goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 524111420b48..24534db87e86 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -826,6 +826,31 @@ static int max_cb_time(struct net *net) return max(nn->nfsd4_lease/10, (time_t)1) * HZ; } +static struct workqueue_struct *callback_wq; + +static bool nfsd4_queue_cb(struct nfsd4_callback *cb) +{ + return queue_work(callback_wq, &cb->cb_work); +} + +static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) +{ + atomic_inc(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_end(struct nfs4_client *clp) +{ + + if (atomic_dec_and_test(&clp->cl_cb_inflight)) + wake_up_var(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp) +{ + wait_var_event(&clp->cl_cb_inflight, + !atomic_read(&clp->cl_cb_inflight)); +} + static const struct cred 
*get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { @@ -937,14 +962,21 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) clp->cl_cb_state = NFSD4_CB_UP; } +static void nfsd4_cb_probe_release(void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + nfsd41_cb_inflight_end(clp); + +} + static const struct rpc_call_ops nfsd4_cb_probe_ops = { /* XXX: release method to ensure we set the cb channel down if * necessary on early failure? */ .rpc_call_done = nfsd4_cb_probe_done, + .rpc_release = nfsd4_cb_probe_release, }; -static struct workqueue_struct *callback_wq; - /* * Poke the callback thread to process any updates to the callback * parameters, and send a null probe. @@ -975,9 +1007,12 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) * If the slot is available, then mark it busy. Otherwise, set the * thread for sleeping on the callback RPC wait queue. */ -static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) +static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task) { - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + struct nfs4_client *clp = cb->cb_clp; + + if (!cb->cb_holds_slot && + test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); /* Race breaker */ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { @@ -986,9 +1021,31 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) } rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); } + cb->cb_holds_slot = true; return true; } +static void nfsd41_cb_release_slot(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + if (cb->cb_holds_slot) { + cb->cb_holds_slot = false; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + } +} + +static void nfsd41_destroy_cb(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + nfsd41_cb_release_slot(cb); + if (cb->cb_ops && cb->cb_ops->release) + cb->cb_ops->release(cb); + nfsd41_cb_inflight_end(clp); +} + /* * TODO: cb_sequence should support referring call lists, cachethis, multiple * slots, and mark callback channel down on communication errors. 
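
nfsd41_cb_get_slot() above keeps the test, sleep, re-test "race breaker": without the second test_and_set_bit(), a holder releasing the slot between the failed first test and rpc_sleep_on() could leave the task asleep while the slot sits free. The idiom in isolation (generic names; a sketch, not nfsd code):

	static bool try_get_slot_or_sleep(unsigned long *busy,
					  struct rpc_wait_queue *waitq,
					  struct rpc_task *task)
	{
		if (test_and_set_bit(0, busy) != 0) {
			rpc_sleep_on(waitq, task, NULL);
			/* Race breaker: the holder may have released the
			 * slot after our first test, before we slept. */
			if (test_and_set_bit(0, busy) != 0)
				return false;	/* still busy; stay queued */
			rpc_wake_up_queued_task(waitq, task);
		}
		return true;
	}
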
@@ -1005,11 +1062,8 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) */ cb->cb_seq_status = 1; cb->cb_status = 0; - if (minorversion) { - if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task)) - return; - cb->cb_holds_slot = true; - } + if (minorversion && !nfsd41_cb_get_slot(cb, task)) + return; rpc_call_start(task); } @@ -1072,13 +1126,12 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback } break; default: + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); dprintk("%s: unprocessed error %d\n", __func__, cb->cb_seq_status); } - cb->cb_holds_slot = false; - clear_bit(0, &clp->cl_cb_slot_busy); - rpc_wake_up_next(&clp->cl_cb_waitq); + nfsd41_cb_release_slot(cb); dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); @@ -1091,8 +1144,10 @@ retry_nowait: ret = false; goto out; need_restart: - task->tk_status = 0; - cb->cb_need_restart = true; + if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + task->tk_status = 0; + cb->cb_need_restart = true; + } return false; } @@ -1134,9 +1189,9 @@ static void nfsd4_cb_release(void *calldata) struct nfsd4_callback *cb = calldata; if (cb->cb_need_restart) - nfsd4_run_cb(cb); + nfsd4_queue_cb(cb); else - cb->cb_ops->release(cb); + nfsd41_destroy_cb(cb); } @@ -1170,6 +1225,7 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) */ nfsd4_run_cb(&clp->cl_cb_null); flush_workqueue(callback_wq); + nfsd41_cb_inflight_wait_complete(clp); } /* requires cl_lock: */ @@ -1187,6 +1243,12 @@ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) return NULL; } +/* + * Note there isn't a lot of locking in this code; instead we depend on + * the fact that it is run from the callback_wq, which won't run two + * work items at once. So, for example, callback_wq handles all access + * of cl_cb_client and all calls to rpc_create or rpc_shutdown_client. 
+ */ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) { struct nfs4_cb_conn conn; @@ -1255,8 +1317,7 @@ nfsd4_run_cb_work(struct work_struct *work) clnt = clp->cl_cb_client; if (!clnt) { /* Callback channel broken, or client killed; give up: */ - if (cb->cb_ops && cb->cb_ops->release) - cb->cb_ops->release(cb); + nfsd41_destroy_cb(cb); return; } @@ -1265,6 +1326,7 @@ nfsd4_run_cb_work(struct work_struct *work) */ if (!cb->cb_ops && clp->cl_minorversion) { clp->cl_cb_state = NFSD4_CB_UP; + nfsd41_destroy_cb(cb); return; } @@ -1290,5 +1352,9 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, void nfsd4_run_cb(struct nfsd4_callback *cb) { - queue_work(callback_wq, &cb->cb_work); + struct nfs4_client *clp = cb->cb_clp; + + nfsd41_cb_inflight_begin(clp); + if (!nfsd4_queue_cb(cb)) + nfsd41_cb_inflight_end(clp); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4e3e77b76411..4798667af647 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1077,7 +1077,8 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos, - dst->nf_file, clone->cl_dst_pos, clone->cl_count); + dst->nf_file, clone->cl_dst_pos, clone->cl_count, + EX_ISSYNC(cstate->current_fh.fh_export)); nfsd_file_put(dst); nfsd_file_put(src); @@ -1297,7 +1298,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: return status; out_err: - cleanup_async_copy(async_copy); + if (async_copy) + cleanup_async_copy(async_copy); goto out; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index cdc75ad4438b..2481e7662128 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1578,6 +1578,7 @@ nfsd4_cld_tracking_init(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; + struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) @@ -1586,11 +1587,6 @@ nfsd4_cld_tracking_init(struct net *net) status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; - nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(nn->cld_net->cn_tfm)) { - status = PTR_ERR(nn->cld_net->cn_tfm); - goto err_remove; - } /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to @@ -1607,6 +1603,12 @@ nfsd4_cld_tracking_init(struct net *net) status = -ETIMEDOUT; goto err_remove; } + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + status = PTR_ERR(tfm); + goto err_remove; + } + nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) @@ -1850,19 +1852,14 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) static char * bin_to_hex_dup(const unsigned char *src, int srclen) { - int i; - char *buf, *hex; + char *buf; /* +1 for terminating NULL */ - buf = kmalloc((srclen * 2) + 1, GFP_KERNEL); + buf = kzalloc((srclen * 2) + 1, GFP_KERNEL); if (!buf) return buf; - hex = buf; - for (i = 0; i < srclen; i++) { - sprintf(hex, "%2.2x", *src++); - hex += 2; - } + bin2hex(buf, src, srclen); return buf; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c65aeaa812d4..369e574c5092 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2382,10 +2382,10 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); - seq_printf(s, "access: \%s\%s, ", + seq_printf(s, "access: %s%s, ", access & 
NFS4_SHARE_ACCESS_READ ? "r" : "-", access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); - seq_printf(s, "deny: \%s\%s, ", + seq_printf(s, "deny: %s%s, ", deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); @@ -3548,12 +3548,17 @@ static bool replay_matches_cache(struct svc_rqst *rqstp, (bool)seq->cachethis) return false; /* - * If there's an error than the reply can have fewer ops than - * the call. But if we cached a reply with *more* ops than the - * call you're sending us now, then this new call is clearly not - * really a replay of the old one: + * If there's an error then the reply can have fewer ops than + * the call. */ - if (slot->sl_opcnt < argp->opcnt) + if (slot->sl_opcnt < argp->opcnt && !slot->sl_status) + return false; + /* + * But if we cached a reply with *more* ops than the call you're + * sending us now, then this new call is clearly not really a + * replay of the old one: + */ + if (slot->sl_opcnt > argp->opcnt) return false; /* This is the only check explicitly called by spec: */ if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 533d0fc3c96b..d2dc4c0e22e8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2991,18 +2991,9 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, __be32 nfserr; int ignore_crossmnt = 0; - dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen); + dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); - if (d_really_is_negative(dentry)) { - /* - * we're not holding the i_mutex here, so there's - * a window where this directory entry could have gone - * away. - */ - dput(dentry); - return nfserr_noent; - } exp_get(exp); /* @@ -3461,7 +3452,6 @@ static __be32 nfsd4_encode_splice_read( struct xdr_stream *xdr = &resp->xdr; struct xdr_buf *buf = xdr->buf; u32 eof; - long len; int space_left; __be32 nfserr; __be32 *p = xdr->p - 2; @@ -3470,7 +3460,6 @@ static __be32 nfsd4_encode_splice_read( if (xdr->end - xdr->p < 1) return nfserr_resource; - len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index af2947551e9c..57b93d95fa5c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -280,7 +280,8 @@ void nfsd_lockd_shutdown(void); #define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP) #define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) #define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) -#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) +#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) +#define nfserr_file_open cpu_to_be32(NFS4ERR_FILE_OPEN) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. 
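
The replay_matches_cache() change above splits the op-count comparison into its two real cases: a cached reply may contain fewer ops than the incoming call only when the cached request failed partway through, and a cached reply with more ops can never be a replay. The pair of tests collapses to a single predicate (same names as the hunk; shown only for clarity):

	/* Not a replay if the cached reply is short without an error,
	 * or longer than the incoming call. */
	if ((slot->sl_opcnt < argp->opcnt && !slot->sl_status) ||
	    slot->sl_opcnt > argp->opcnt)
		return false;
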
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fdf7ed4bd5dd..e8bee8ff30c5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -95,12 +95,11 @@ static const struct svc_version *nfsd_acl_version[] = { #define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) -static const struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; static struct svc_program nfsd_acl_program = { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, - .pg_vers = nfsd_acl_versions, + .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", .pg_stats = &nfsd_acl_svcstats, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 46f56afb6cb8..d61b83b9654c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -367,6 +367,7 @@ struct nfs4_client { struct net *net; struct list_head async_copies; /* list of async copies */ spinlock_t async_lock; /* lock for async copies */ + atomic_t cl_cb_inflight; /* Outstanding callbacks */ }; /* struct nfs4_client_reset diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bd0a385df3fc..c0dc491537a6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -525,7 +525,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, #endif __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, - u64 dst_pos, u64 count) + u64 dst_pos, u64 count, bool sync) { loff_t cloned; @@ -534,6 +534,12 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, return nfserrno(cloned); if (count && cloned != count) return nfserrno(-EINVAL); + if (sync) { + loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX; + int status = vfs_fsync_range(dst, dst_pos, dst_end, 0); + if (status < 0) + return nfserrno(status); + } return 0; } @@ -1809,7 +1815,17 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, out_drop_write: fh_drop_write(fhp); out_nfserr: - err = nfserrno(host_err); + if (host_err == -EBUSY) { + /* name is mounted-on. There is no perfect + * error status. 
+ */ + if (nfsd_v4client(rqstp)) + err = nfserr_file_open; + else + err = nfserr_acces; + } else { + err = nfserrno(host_err); + } out: return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a13fd9d7e1f5..cc110a10bfe8 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -56,7 +56,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); __be32 nfsd4_clone_file_range(struct file *, u64, struct file *, - u64, u64); + u64, u64, bool); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 91b9dac6b2cc..4ba73dbf3e8d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1354,6 +1354,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: + case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8508ab575017..0aa362b88550 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -523,7 +523,7 @@ static const struct file_operations fanotify_fops = { .fasync = NULL, .release = fanotify_release, .unlocked_ioctl = fanotify_ioctl, - .compat_ioctl = fanotify_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 1e2bfd26b352..ef83f4020554 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -50,7 +50,7 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) f.handle.handle_bytes = sizeof(f.pad); size = f.handle.handle_bytes >> 2; - ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); + ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, NULL); if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2ecef6155fc0..3e77b728a22b 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -381,8 +381,6 @@ out: } EXPORT_SYMBOL_GPL(fsnotify); -extern struct kmem_cache *fsnotify_mark_connector_cachep; - static __init int fsnotify_init(void) { int ret; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index f3462828a0e2..ff2063ec6b0f 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -65,4 +65,6 @@ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); +extern struct kmem_cache *fsnotify_mark_connector_cachep; + #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 3e7da392aa6f..bb981ec76456 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); up_read(&OCFS2_I(inode)->ip_xattr_sem); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR_OR_ZERO(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f9baefc76cf9..88534eb0e7c2 100644 --- 
a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2288,9 +2288,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int ret = 0; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) + if (jbd2_handle_buffer_credits(handle) < credits) ret = ocfs2_extend_trans(handle, - credits - handle->h_buffer_credits); + credits - jbd2_handle_buffer_credits(handle)); return ret; } @@ -2367,7 +2367,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { - int ret, start, orig_credits = handle->h_buffer_credits; + int ret, start, orig_credits = jbd2_handle_buffer_credits(handle); u32 cpos; struct ocfs2_path *left_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); @@ -3148,7 +3148,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc) { - int ret, orig_credits = handle->h_buffer_credits; + int ret, orig_credits = jbd2_handle_buffer_credits(handle); struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; @@ -3386,8 +3386,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - right_path); + jbd2_handle_buffer_credits(handle), + right_path); if (ret) { mlog_errno(ret); goto out; @@ -3548,8 +3548,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, - handle->h_buffer_credits, - left_path); + jbd2_handle_buffer_credits(handle), + left_path); if (ret) { mlog_errno(ret); goto out; @@ -3623,7 +3623,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, le16_to_cpu(el->l_next_free_rec) == 1) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), right_path); if (ret) { mlog_errno(ret); @@ -3669,7 +3669,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3725,7 +3725,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3755,7 +3755,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -3799,7 +3799,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle, if (ctxt->c_split_covers_rec) { /* extend credit for ocfs2_remove_rightmost_path */ ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -5358,7 +5358,7 @@ static int ocfs2_truncate_rec(handle_t *handle, if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { /* extend credit for ocfs2_remove_rightmost_path */ ret = 
ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, + jbd2_handle_buffer_credits(handle), path); if (ret) { mlog_errno(ret); @@ -5427,8 +5427,8 @@ static int ocfs2_truncate_rec(handle_t *handle, } ret = ocfs2_extend_rotate_transaction(handle, 0, - handle->h_buffer_credits, - path); + jbd2_handle_buffer_credits(handle), + path); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 9cd0a6815933..3a67a6518ddf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -11,7 +11,6 @@ #include <linux/pagemap.h> #include <asm/byteorder.h> #include <linux/swap.h> -#include <linux/pipe_fs_i.h> #include <linux/mpage.h> #include <linux/quotaops.h> #include <linux/blkdev.h> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6e774c5ea13b..1c4c51f3df60 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1687,7 +1687,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, spin_unlock_irqrestore(&lockres->l_lock, flags); #ifdef CONFIG_DEBUG_LOCK_ALLOC if (lockres->l_lockdep_map.key != NULL) - rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); + rwsem_release(&lockres->l_lockdep_map, caller_ip); #endif } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2e982db3e1ae..9876db52913a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1230,6 +1230,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); if (IS_ERR(transfer_to[USRQUOTA])) { status = PTR_ERR(transfer_to[USRQUOTA]); + transfer_to[USRQUOTA] = NULL; goto bail_unlock; } } @@ -1239,6 +1240,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); if (IS_ERR(transfer_to[GRPQUOTA])) { status = PTR_ERR(transfer_to[GRPQUOTA]); + transfer_to[GRPQUOTA] = NULL; goto bail_unlock; } } @@ -2096,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) return 0; } -static int ocfs2_prepare_inode_for_refcount(struct inode *inode, - struct file *file, - loff_t pos, size_t count, - int *meta_level) +static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int overwrite_io, + int write_sem, + int wait) { - int ret; - struct buffer_head *di_bh = NULL; - u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; - u32 clusters = - ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + int ret = 0; - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, meta_level); + else + ret = ocfs2_try_inode_lock(inode, + overwrite_io ? 
NULL : di_bh, meta_level); + if (ret < 0) goto out; + + if (wait) { + if (write_sem) + down_write(&OCFS2_I(inode)->ip_alloc_sem); + else + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + if (write_sem) + ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem); + else + ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem); + + if (!ret) { + ret = -EAGAIN; + goto out_unlock; + } } - *meta_level = 1; + return ret; - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); - if (ret) - mlog_errno(ret); +out_unlock: + brelse(*di_bh); + ocfs2_inode_unlock(inode, meta_level); out: - brelse(di_bh); return ret; } +static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int write_sem) +{ + if (write_sem) + up_write(&OCFS2_I(inode)->ip_alloc_sem); + else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(*di_bh); + *di_bh = NULL; + + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); +} + static int ocfs2_prepare_inode_for_write(struct file *file, loff_t pos, size_t count, int wait) { int ret = 0, meta_level = 0, overwrite_io = 0; + int write_sem = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); struct buffer_head *di_bh = NULL; + u32 cpos; + u32 clusters; /* * We start with a read level meta lock and only jump to an ex * if we need to make modifications here. */ for(;;) { - if (wait) - ret = ocfs2_inode_lock(inode, NULL, meta_level); - else - ret = ocfs2_try_inode_lock(inode, - overwrite_io ? NULL : &di_bh, meta_level); + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + overwrite_io, + write_sem, + wait); if (ret < 0) { - meta_level = -1; if (ret != -EAGAIN) mlog_errno(ret); goto out; @@ -2154,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ if (!wait && !overwrite_io) { overwrite_io = 1; - if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { - ret = -EAGAIN; - goto out_unlock; - } ret = ocfs2_overwrite_io(inode, di_bh, pos, count); - brelse(di_bh); - di_bh = NULL; - up_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) { if (ret != -EAGAIN) mlog_errno(ret); @@ -2181,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * set inode->i_size at the end of a write. 
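The two helpers introduced above fold the blocking and non-blocking acquisition paths into one shape: wait chooses lock vs. trylock, write_sem chooses the read or write side of ip_alloc_sem, and a failed trylock surfaces as -EAGAIN. The same contract with POSIX rwlocks, as a standalone sketch (lock_for_tree is a local name, not kernel API):

#include <errno.h>
#include <pthread.h>

/* Blocking or trylock acquisition of either side of a rwlock;
 * -EAGAIN reports contention in the non-blocking case. */
static int lock_for_tree(pthread_rwlock_t *l, int write_sem, int wait)
{
	int err;

	if (wait)
		err = write_sem ? pthread_rwlock_wrlock(l)
				: pthread_rwlock_rdlock(l);
	else
		err = write_sem ? pthread_rwlock_trywrlock(l)
				: pthread_rwlock_tryrdlock(l);
	if (err == EBUSY)
		return -EAGAIN;		/* caller decides whether to retry */
	return err ? -err : 0;
}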
*/ if (should_remove_suid(dentry)) { if (meta_level == 0) { - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); meta_level = 1; continue; } @@ -2195,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file, ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { - ocfs2_inode_unlock(inode, meta_level); - meta_level = -1; - - ret = ocfs2_prepare_inode_for_refcount(inode, - file, - pos, - count, - &meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + overwrite_io, + 1, + wait); + write_sem = 1; + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out; + } + + cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); } if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_unlock; } @@ -2217,10 +2265,10 @@ out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, pos, count, wait); - brelse(di_bh); - - if (meta_level >= 0) - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); out: return ret; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index efeea208fdeb..89984172fc4a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -985,6 +985,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EFAULT; return ocfs2_info_handle(inode, &info, 1); + case FITRIM: case OCFS2_IOC_MOVE_EXT: break; default: diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 930e3d388579..1afe57f425a0 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -217,7 +217,8 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb) /* At this point, we know that no more recovery threads can be * launched, so wait for any recovery completion work to * complete. 
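Note the shape of the refcount branch above: both locks are dropped, then retaken with write_sem = 1, and cpos/clusters are only computed after the relock, because the file can change while nothing is held. That is the usual release-and-reacquire upgrade, since rwlock-style locks cannot be upgraded in place; a minimal pthread sketch of the idiom:

#include <pthread.h>

/* Upgrade by release-and-reacquire: drop the shared lock, take the
 * exclusive one, then re-derive any state computed under the old lock
 * before trusting it. */
static void upgrade_rwlock(pthread_rwlock_t *l)
{
	pthread_rwlock_unlock(l);	/* others may run in this window */
	pthread_rwlock_wrlock(l);
	/* re-check/recompute here, as the code above redoes cpos/clusters */
}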
*/ - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) + flush_workqueue(osb->ocfs2_wq); /* * Now that recovery is shut down, and the osb is about to be @@ -419,14 +420,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) if (!nblocks) return 0; - old_nblocks = handle->h_buffer_credits; + old_nblocks = jbd2_handle_buffer_credits(handle); trace_ocfs2_extend_trans(old_nblocks, nblocks); #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = jbd2_journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks, 0); if (status < 0) { mlog_errno(status); goto bail; @@ -460,13 +461,13 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) BUG_ON(!handle); - old_nblks = handle->h_buffer_credits; + old_nblks = jbd2_handle_buffer_credits(handle); trace_ocfs2_allocate_extend_trans(old_nblks, thresh); if (old_nblks < thresh) return 0; - status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA); + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 158e5af767fd..720e9f94957e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -377,7 +377,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) struct ocfs2_dinode *alloc = NULL; cancel_delayed_work(&osb->la_enable_wq); - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) + flush_workqueue(osb->ocfs2_wq); if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 7a922190a8c7..eda83487c9ec 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -728,7 +728,7 @@ static int ocfs2_release_dquot(struct dquot *dquot) mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ - if (atomic_read(&dquot->dq_count) > 1) + if (dquot_is_busy(dquot)) goto out; /* Running from downconvert thread? Postpone quota processing to wq */ if (current == osb->dc_task) { diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 69c21a3843af..4180c3ef0a68 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr) { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct journal_head *jh; int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) @@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, if (!buffer_jbd(bg_bh)) return 1; - jbd_lock_bh_state(bg_bh); - bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + jh = bh2jh(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; if (bg) ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); else ret = 1; - jbd_unlock_bh_state(bg_bh); + spin_unlock(&jh->b_state_lock); return ret; } @@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, int status; unsigned int tmp; struct ocfs2_group_desc *undo_bg = NULL; + struct journal_head *jh; /* The caller got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. 
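The jbd2 churn above is mechanical: direct handle->h_buffer_credits reads become jbd2_handle_buffer_credits(), and jbd2_journal_extend() grows a third argument for extra revoke-record credits (0 everywhere here). For context, the extend-or-restart pattern those call sites implement looks roughly like this (a sketch of ocfs2_extend_trans()'s logic against the new API, not a drop-in helper):

/* Ensure a running handle has at least 'want' buffer credits. A positive
 * return from jbd2_journal_extend() means the transaction is too full to
 * extend, so it must be restarted with the full requirement. */
static int extend_or_restart(handle_t *handle, int want)
{
	int have = jbd2_handle_buffer_credits(handle);
	int status;

	if (have >= want)
		return 0;
	status = jbd2_journal_extend(handle, want - have, 0);
	if (status > 0)
		status = jbd2_journal_restart(handle, want);
	return status;
}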
*/ @@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, goto bail; } + jh = bh2jh(group_bh); if (undo_fn) { - jbd_lock_bh_state(group_bh); - undo_bg = (struct ocfs2_group_desc *) - bh2jh(group_bh)->b_committed_data; + spin_lock(&jh->b_state_lock); + undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; BUG_ON(!undo_bg); } @@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), @@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } if (undo_fn) - jbd_unlock_bh_state(group_bh); + spin_unlock(&jh->b_state_lock); ocfs2_journal_dirty(handle, group_bh); bail: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c81e86c62380..05dd68ade293 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -926,8 +926,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) status = -ENOENT; goto out_quota_off; } - status = dquot_enable(inode[type], type, QFMT_OCFS2, - DQUOT_USAGE_ENABLED); + status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); if (status < 0) goto out_quota_off; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d8507972ee13..90c830e3758e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1490,6 +1490,18 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, return loc->xl_ops->xlo_check_space(loc, xi); } +static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) +{ + loc->xl_ops->xlo_add_entry(loc, name_hash); + loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); + /* + * We can't leave the new entry's xe_name_offset at zero or + * add_namevalue() will go nuts. We set it to the size of our + * storage so that it can never be less than any other entry. + */ + loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); +} + static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_info *xi) { @@ -2121,31 +2133,29 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, if (rc) goto out; - if (!loc->xl_entry) { - rc = -EINVAL; - goto out; - } - - if (ocfs2_xa_can_reuse_entry(loc, xi)) { - orig_value_size = loc->xl_entry->xe_value_size; - rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); - if (rc) - goto out; - goto alloc_value; - } + if (loc->xl_entry) { + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; + } - if (!ocfs2_xattr_is_local(loc->xl_entry)) { - orig_clusters = ocfs2_xa_value_clusters(loc); - rc = ocfs2_xa_value_truncate(loc, 0, ctxt); - if (rc) { - mlog_errno(rc); - ocfs2_xa_cleanup_value_truncate(loc, - "overwriting", - orig_clusters); - goto out; + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); + goto out; + } } - } - ocfs2_xa_wipe_namevalue(loc); + ocfs2_xa_wipe_namevalue(loc); + } else + ocfs2_xa_add_entry(loc, name_hash); /* * If we get here, we have a blank entry. Fill it. 
We grow our diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index a5612abc0936..c740159d9ad1 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -46,8 +46,9 @@ static int flush_racache(struct inode *inode) * Post and wait for the I/O upcall to finish */ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, - loff_t *offset, struct iov_iter *iter, size_t total_size, - loff_t readahead_size, struct orangefs_write_range *wr, int *index_return) + loff_t *offset, struct iov_iter *iter, size_t total_size, + loff_t readahead_size, struct orangefs_write_range *wr, + int *index_return, struct file *file) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; @@ -55,6 +56,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, int buffer_index; ssize_t ret; size_t copy_amount; + int open_for_read; + int open_for_write; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) @@ -90,6 +93,38 @@ populate_shared_memory: new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); } + /* + * Orangefs has no open, and orangefs checks file permissions + * on each file access. Posix requires that file permissions + * be checked on open and nowhere else. Orangefs-through-the-kernel + * needs to seem posix compliant. + * + * The VFS opens files, even if the filesystem provides no + * method. We can see if a file was successfully opened for + * read and or for write by looking at file->f_mode. + * + * When writes are flowing from the page cache, file is no + * longer available. We can trust the VFS to have checked + * file->f_mode before writing to the page cache. + * + * The mode of a file might change between when it is opened + * and IO commences, or it might be created with an arbitrary mode. + * + * We'll make sure we don't hit EACCES during the IO stage by + * using UID 0. Some of the time we have access without changing + * to UID 0 - how to check? + */ + if (file) { + open_for_write = file->f_mode & FMODE_WRITE; + open_for_read = file->f_mode & FMODE_READ; + } else { + open_for_write = 1; + open_for_read = 0; /* not relevant? 
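The long comment above hinges on file->f_mode remembering how the file was opened. The userspace analogue is fcntl(F_GETFL): a descriptor carries its access mode for its whole lifetime. A small sketch (opened_for is a local name):

#include <fcntl.h>

/* Report whether an fd was opened readable and/or writable, mirroring
 * the FMODE_READ/FMODE_WRITE tests above. */
static int opened_for(int fd, int *readable, int *writable)
{
	int fl = fcntl(fd, F_GETFL);

	if (fl < 0)
		return -1;
	*readable = (fl & O_ACCMODE) != O_WRONLY;
	*writable = (fl & O_ACCMODE) != O_RDONLY;
	return 0;
}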
*/ + } + if ((type == ORANGEFS_IO_WRITE) && open_for_write) + new_op->upcall.uid = 0; + if ((type == ORANGEFS_IO_READ) && open_for_read) + new_op->upcall.uid = 0; gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): offset: %llu total_size: %zd\n", diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index efb12197da18..961c0fd8675a 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -55,7 +55,7 @@ static int orangefs_writepage_locked(struct page *page, iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, - len, wr, NULL); + len, wr, NULL, NULL); if (ret < 0) { SetPageError(page); mapping_set_error(page->mapping, ret); @@ -126,7 +126,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, wr.uid = ow->uid; wr.gid = ow->gid; ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, - 0, &wr, NULL); + 0, &wr, NULL, NULL); if (ret < 0) { for (i = 0; i < ow->npages; i++) { SetPageError(ow->pages[i]); @@ -311,7 +311,7 @@ static int orangefs_readpage(struct file *file, struct page *page) iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - read_size, inode->i_size, NULL, &buffer_index); + read_size, inode->i_size, NULL, &buffer_index, file); remaining = ret; /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); @@ -651,7 +651,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, (int)*offset); ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0, NULL, NULL); + each_count, 0, NULL, NULL, file); gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): return from wait_for_io:%d\n", __func__, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 34a6c99fa29b..ed67f39fa7ce 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -398,7 +398,8 @@ bool __is_daemon_in_service(void); */ int orangefs_revalidate_mapping(struct inode *); ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, - struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *); + struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *, + struct file *); ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, struct iov_iter *); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index e9717c2f7d45..c269d6033525 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -200,7 +200,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, int err; bool last_element = !post[0]; - this = lookup_one_len_unlocked(name, base, namelen); + this = lookup_positive_unlocked(name, base, namelen); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; @@ -208,8 +208,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto out; goto out_err; } - if (!this->d_inode) - goto put_and_out; if (ovl_dentry_weird(this)) { /* Don't support traversing automounts and other weirdness */ @@ -651,7 +649,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) if (err) return ERR_PTR(err); - index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) @@ -659,9 +657,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) return index; } - if (d_is_negative(index)) - err = 0; - else if 
(ovl_is_whiteout(index)) + if (ovl_is_whiteout(index)) err = -ESTALE; else if (ovl_dentry_weird(index)) err = -EIO; @@ -685,7 +681,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -700,9 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, } inode = d_inode(index); - if (d_is_negative(index)) { - goto out_dput; - } else if (ovl_is_whiteout(index) && !verify) { + if (ovl_is_whiteout(index) && !verify) { /* * When index lookup is called with !verify for decoding an * overlay file handle, a whiteout index implies that decode @@ -1131,7 +1125,7 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_one_len_unlocked(name->name, lowerdir, + this = lookup_positive_unlocked(name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { @@ -1148,10 +1142,8 @@ bool ovl_lower_positive(struct dentry *dentry) break; } } else { - if (this->d_inode) { - positive = !ovl_is_whiteout(this); - done = true; - } + positive = !ovl_is_whiteout(this); + done = true; dput(this); } } diff --git a/fs/pipe.c b/fs/pipe.c index 8a2ab2f974bd..87109e761fa5 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -43,10 +43,12 @@ unsigned long pipe_user_pages_hard; unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* - * We use a start+len construction, which provides full use of the - * allocated memory. - * -- Florian Coosmann (FGC) - * + * We use head and tail indices that aren't masked off, except at the point of + * dereference, but rather they're allowed to wrap naturally. This means there + * isn't a dead spot in the buffer, but the ring has to be a power of two and + * <= 2^31. + * -- David Howells 2019-09-23. + * * Reads with count = 0 should always return 0. * -- Julian Bradfield 1999-06-07. * @@ -268,27 +270,48 @@ static bool pipe_buf_can_merge(struct pipe_buffer *buf) return buf->ops == &anon_pipe_buf_ops; } +/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ +static inline bool pipe_readable(const struct pipe_inode_info *pipe) +{ + unsigned int head = READ_ONCE(pipe->head); + unsigned int tail = READ_ONCE(pipe->tail); + unsigned int writers = READ_ONCE(pipe->writers); + + return !pipe_empty(head, tail) || !writers; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - int do_wakeup; + bool was_full; ssize_t ret; /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; - do_wakeup = 0; ret = 0; __pipe_lock(pipe); + + /* + * We only wake up writers if the pipe was full when we started + * reading in order to avoid unnecessary wakeups. + * + * But when we do wake up writers, we do so using a sync wakeup + * (WF_SYNC), because we want them to get going and generate more + * data for us. 
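The header comment above is the core of the pipe rework in this series: head and tail run freely as unsigned integers and are masked only when indexing, so occupancy is a plain subtraction that stays correct across wraparound, at the cost of requiring a power-of-two ring no larger than 2^31. A self-contained model of that arithmetic (RING stands in for pipe->ring_size):

#include <assert.h>
#include <limits.h>

#define RING 8u				/* power of two, like ring_size */

static unsigned int occupancy(unsigned int head, unsigned int tail)
{
	return head - tail;		/* well-defined across wraparound */
}

int main(void)
{
	unsigned int head = UINT_MAX, tail = UINT_MAX - 3;

	assert(occupancy(head, tail) == 3);
	head++;					/* index wraps to 0 ... */
	assert(occupancy(head, tail) == 4);	/* ... occupancy still right */
	assert(head != tail);			/* i.e. not pipe_empty() */
	assert(occupancy(head, tail) < RING);	/* i.e. not pipe_full() */
	/* the slot itself would be bufs[tail & (RING - 1)] */
	return 0;
}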
+ */ + was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { - int bufs = pipe->nrbufs; - if (bufs) { - int curbuf = pipe->curbuf; - struct pipe_buffer *buf = pipe->bufs + curbuf; + unsigned int head = pipe->head; + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; + + if (!pipe_empty(head, tail)) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; size_t written; int error; @@ -321,47 +344,43 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (!buf->len) { pipe_buf_release(pipe, buf); - curbuf = (curbuf + 1) & (pipe->buffers - 1); - pipe->curbuf = curbuf; - pipe->nrbufs = --bufs; - do_wakeup = 1; + spin_lock_irq(&pipe->wait.lock); + tail++; + pipe->tail = tail; + spin_unlock_irq(&pipe->wait.lock); } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ + if (!pipe_empty(head, tail)) /* More to do? */ + continue; } - if (bufs) /* More to do? */ - continue; + if (!pipe->writers) break; - if (!pipe->waiting_writers) { - /* syscall merging: Usually we must not sleep - * if O_NONBLOCK is set, or if we got some data. - * But if a writer sleeps in kernel space, then - * we can wait for that data without violating POSIX. - */ - if (ret) - break; - if (filp->f_flags & O_NONBLOCK) { - ret = -EAGAIN; - break; - } + if (ret) + break; + if (filp->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } - if (do_wakeup) { + __pipe_unlock(pipe); + if (was_full) { wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(pipe); + wait_event_interruptible(pipe->wait, pipe_readable(pipe)); + __pipe_lock(pipe); + was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); } __pipe_unlock(pipe); - /* Signal writers asynchronously that there is more room. */ - if (do_wakeup) { + if (was_full) { wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } @@ -375,15 +394,27 @@ static inline int is_packetized(struct file *file) return (file->f_flags & O_DIRECT) != 0; } +/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ +static inline bool pipe_writable(const struct pipe_inode_info *pipe) +{ + unsigned int head = READ_ONCE(pipe->head); + unsigned int tail = READ_ONCE(pipe->tail); + unsigned int max_usage = READ_ONCE(pipe->max_usage); + + return !pipe_full(head, tail, max_usage) || + !READ_ONCE(pipe->readers); +} + static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; + unsigned int head; ssize_t ret = 0; - int do_wakeup = 0; size_t total_len = iov_iter_count(from); ssize_t chars; + bool was_empty = false; /* Null write succeeds. */ if (unlikely(total_len == 0)) @@ -397,12 +428,23 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - /* We try to merge small writes */ - chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (pipe->nrbufs && chars != 0) { - int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + lastbuf; + /* + * Only wake up if the pipe started out empty, since + * otherwise there should be no readers waiting. + * + * If it wasn't empty we try to merge new data into + * the last buffer. 
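A detail worth pulling out of pipe_write() above: the merge candidate is the slot just behind head, bufs[(head - 1) & mask], and merging is only legal for an anonymous buffer whose page still has room. The predicate in isolation (struct buf and PAGE_SZ are local stand-ins for struct pipe_buffer and PAGE_SIZE):

#include <stddef.h>

#define PAGE_SZ 4096u

struct buf {
	unsigned int offset;	/* start of valid data in the page */
	unsigned int len;	/* bytes of valid data */
	int anon;		/* stand-in for pipe_buf_can_merge() */
};

/* A trailing anonymous buffer can absorb a small write if the new bytes
 * still fit inside its page. */
static int can_merge(const struct buf *last, size_t chars)
{
	return last->anon && chars &&
	       last->offset + last->len + chars <= PAGE_SZ;
}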
+ * + * That naturally merges small writes, but it also + * page-aligs the rest of the writes for large writes + * spanning multiple pages. + */ + head = pipe->head; + was_empty = pipe_empty(head, pipe->tail); + chars = total_len & (PAGE_SIZE-1); + if (chars && !was_empty) { + unsigned int mask = pipe->ring_size - 1; + struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { @@ -415,7 +457,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) ret = -EFAULT; goto out; } - do_wakeup = 1; + buf->len += ret; if (!iov_iter_count(from)) goto out; @@ -423,18 +465,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } for (;;) { - int bufs; - if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = pipe->nrbufs; - if (bufs < pipe->buffers) { - int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); - struct pipe_buffer *buf = pipe->bufs + newbuf; + + head = pipe->head; + if (!pipe_full(head, pipe->tail, pipe->max_usage)) { + unsigned int mask = pipe->ring_size - 1; + struct pipe_buffer *buf = &pipe->bufs[head & mask]; struct page *page = pipe->tmp_page; int copied; @@ -446,38 +487,54 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } pipe->tmp_page = page; } - /* Always wake up, even if the copy fails. Otherwise - * we lock up (O_NONBLOCK-)readers that sleep due to - * syscall merging. - * FIXME! Is this really true? + + /* Allocate a slot in the ring in advance and attach an + * empty buffer. If we fault or otherwise fail to use + * it, either the reader will consume it or it'll still + * be there for the next write. */ - do_wakeup = 1; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); - if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { - if (!ret) - ret = -EFAULT; - break; + spin_lock_irq(&pipe->wait.lock); + + head = pipe->head; + if (pipe_full(head, pipe->tail, pipe->max_usage)) { + spin_unlock_irq(&pipe->wait.lock); + continue; } - ret += copied; + + pipe->head = head + 1; + spin_unlock_irq(&pipe->wait.lock); /* Insert it into the buffer array */ + buf = &pipe->bufs[head & mask]; buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = copied; + buf->len = 0; buf->flags = 0; if (is_packetized(filp)) { buf->ops = &packet_pipe_buf_ops; buf->flags = PIPE_BUF_FLAG_PACKET; } - pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + if (!ret) + ret = -EFAULT; + break; + } + ret += copied; + buf->offset = 0; + buf->len = copied; + if (!iov_iter_count(from)) break; } - if (bufs < pipe->buffers) + + if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; + + /* Wait for buffer space to become available. */ if (filp->f_flags & O_NONBLOCK) { if (!ret) ret = -EAGAIN; @@ -488,18 +545,35 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) ret = -ERESTARTSYS; break; } - if (do_wakeup) { + + /* + * We're going to release the pipe lock and wait for more + * space. We wake up any readers if necessary, and then + * after waiting we need to re-check whether the pipe + * become empty while we dropped the lock. 
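The unlock/wake/wait/relock sequence described above is exactly what a condition-variable wait does in one call: it atomically drops the lock, sleeps, and retakes the lock, after which the condition must be re-checked. The write side of that discipline with pthreads (xpipe and its fields are local names):

#include <pthread.h>

struct xpipe {
	pthread_mutex_t lock;
	pthread_cond_t readable;	/* signalled when data arrives */
	pthread_cond_t writable;	/* signalled when space appears */
	unsigned int head, tail, max;
};

/* Called with p->lock held and the ring full. */
static void wait_for_space(struct xpipe *p, int was_empty)
{
	if (was_empty)			/* wake readers before sleeping */
		pthread_cond_signal(&p->readable);
	/* unlocks, sleeps, relocks -- the by-hand sequence above */
	pthread_cond_wait(&p->writable, &p->lock);
	/* like the kernel code, the caller must now re-sample head/tail */
}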
+ */ + __pipe_unlock(pipe); + if (was_empty) { wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; } - pipe->waiting_writers++; - pipe_wait(pipe); - pipe->waiting_writers--; + wait_event_interruptible(pipe->wait, pipe_writable(pipe)); + __pipe_lock(pipe); + was_empty = pipe_empty(head, pipe->tail); } out: __pipe_unlock(pipe); - if (do_wakeup) { + + /* + * If we do do a wakeup event, we do a 'sync' wakeup, because we + * want the reader to start processing things asap, rather than + * leave the data pending. + * + * This is particularly important for small writes, because of + * how (for example) the GNU make jobserver uses small writes to + * wake up pending jobs + */ + if (was_empty) { wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } @@ -515,17 +589,19 @@ out: static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - int count, buf, nrbufs; + int count, head, tail, mask; switch (cmd) { case FIONREAD: __pipe_lock(pipe); count = 0; - buf = pipe->curbuf; - nrbufs = pipe->nrbufs; - while (--nrbufs >= 0) { - count += pipe->bufs[buf].len; - buf = (buf+1) & (pipe->buffers - 1); + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + + while (tail != head) { + count += pipe->bufs[tail & mask].len; + tail++; } __pipe_unlock(pipe); @@ -541,21 +617,35 @@ pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - int nrbufs; + unsigned int head, tail; + /* + * Reading only -- no need for acquiring the semaphore. + * + * But because this is racy, the code has to add the + * entry to the poll table _first_ .. + */ poll_wait(filp, &pipe->wait, wait); - /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = pipe->nrbufs; + /* + * .. and only then can you do the racy tests. That way, + * if something changes and you got it wrong, the poll + * table entry will wake you up and fix it. + */ + head = READ_ONCE(pipe->head); + tail = READ_ONCE(pipe->tail); + mask = 0; if (filp->f_mode & FMODE_READ) { - mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; + if (!pipe_empty(head, tail)) + mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0; + if (!pipe_full(head, tail, pipe->max_usage)) + mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
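The FIONREAD loop above sums bufs[tail & mask].len from tail to head; from userspace this is the standard way to ask a pipe how much input is buffered:

#include <sys/ioctl.h>

/* Bytes currently readable from a pipe fd, or -1 on error. */
static int bytes_buffered(int fd)
{
	int avail;

	if (ioctl(fd, FIONREAD, &avail) < 0)
		return -1;
	return avail;
}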
@@ -679,7 +769,8 @@ struct pipe_inode_info *alloc_pipe_info(void) if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = pipe_bufs; + pipe->max_usage = pipe_bufs; + pipe->ring_size = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; @@ -697,9 +788,9 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - (void) account_pipe_buffers(pipe->user, pipe->buffers, 0); + (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0); free_uid(pipe->user); - for (i = 0; i < pipe->buffers; i++) { + for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) pipe_buf_release(pipe, buf); @@ -793,6 +884,8 @@ int create_pipe_files(struct file **res, int flags) } res[0]->private_data = inode->i_pipe; res[1] = f; + stream_open(inode, res[0]); + stream_open(inode, res[1]); return 0; } @@ -880,7 +973,7 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { - int cur = *cnt; + int cur = *cnt; while (cur == *cnt) { pipe_wait(pipe); @@ -931,9 +1024,9 @@ static int fifo_open(struct inode *inode, struct file *filp) __pipe_lock(pipe); /* We can only do regular read/write on fifos */ - filp->f_mode &= (FMODE_READ | FMODE_WRITE); + stream_open(inode, filp); - switch (filp->f_mode) { + switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_READ: /* * O_RDONLY @@ -955,7 +1048,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } } break; - + case FMODE_WRITE: /* * O_WRONLY @@ -975,7 +1068,7 @@ static int fifo_open(struct inode *inode, struct file *filp) goto err_wr; } break; - + case FMODE_READ | FMODE_WRITE: /* * O_RDWR @@ -1054,14 +1147,14 @@ unsigned int round_pipe_size(unsigned long size) static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; - unsigned int size, nr_pages; + unsigned int size, nr_slots, head, tail, mask, n; unsigned long user_bufs; long ret = 0; size = round_pipe_size(arg); - nr_pages = size >> PAGE_SHIFT; + nr_slots = size >> PAGE_SHIFT; - if (!nr_pages) + if (!nr_slots) return -EINVAL; /* @@ -1071,13 +1164,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) * Decreasing the pipe capacity is always permitted, even * if the user is currently over a limit. */ - if (nr_pages > pipe->buffers && + if (nr_slots > pipe->ring_size && size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; - user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); + user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots); - if (nr_pages > pipe->buffers && + if (nr_slots > pipe->ring_size && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && is_unprivileged_user()) { @@ -1086,17 +1179,21 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) } /* - * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't - * expect a lot of shrink+grow operations, just free and allocate - * again like we would do for growing. If the pipe currently + * We can shrink the pipe, if arg is greater than the ring occupancy. + * Since we don't expect a lot of shrink+grow operations, just free and + * allocate again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. 
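Both create_pipe_files() and fifo_open() above now mark the file as a stream via stream_open(). Pipes have no file position, so this makes lseek(2)/pread(2)/pwrite(2) fail cleanly and drops f_pos serialization. Roughly the effect of the helper (hedged sketch of fs/open.c behaviour, not a verbatim copy):

/* A stream: no seeking, no positioned I/O, no f_pos locking needed. */
static int stream_open_sketch(struct inode *inode, struct file *filp)
{
	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE |
			  FMODE_ATOMIC_POS);
	filp->f_mode |= FMODE_STREAM;
	return 0;
}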
*/ - if (nr_pages < pipe->nrbufs) { + mask = pipe->ring_size - 1; + head = pipe->head; + tail = pipe->tail; + n = pipe_occupancy(pipe->head, pipe->tail); + if (nr_slots < n) { ret = -EBUSY; goto out_revert_acct; } - bufs = kcalloc(nr_pages, sizeof(*bufs), + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) { ret = -ENOMEM; @@ -1105,33 +1202,38 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) /* * The pipe array wraps around, so just start the new one at zero - * and adjust the indexes. + * and adjust the indices. */ - if (pipe->nrbufs) { - unsigned int tail; - unsigned int head; - - tail = pipe->curbuf + pipe->nrbufs; - if (tail < pipe->buffers) - tail = 0; - else - tail &= (pipe->buffers - 1); - - head = pipe->nrbufs - tail; - if (head) - memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); - if (tail) - memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); + if (n > 0) { + unsigned int h = head & mask; + unsigned int t = tail & mask; + if (h > t) { + memcpy(bufs, pipe->bufs + t, + n * sizeof(struct pipe_buffer)); + } else { + unsigned int tsize = pipe->ring_size - t; + if (h > 0) + memcpy(bufs + tsize, pipe->bufs, + h * sizeof(struct pipe_buffer)); + memcpy(bufs, pipe->bufs + t, + tsize * sizeof(struct pipe_buffer)); + } } - pipe->curbuf = 0; + head = n; + tail = 0; + kfree(pipe->bufs); pipe->bufs = bufs; - pipe->buffers = nr_pages; - return nr_pages * PAGE_SIZE; + pipe->ring_size = nr_slots; + pipe->max_usage = nr_slots; + pipe->tail = tail; + pipe->head = head; + wake_up_interruptible_all(&pipe->wait); + return pipe->max_usage * PAGE_SIZE; out_revert_acct: - (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); + (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size); return ret; } @@ -1161,7 +1263,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: - ret = pipe->buffers * PAGE_SIZE; + ret = pipe->max_usage * PAGE_SIZE; break; default: ret = -EINVAL; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index cb5629bd5fff..733881a6387b 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -42,8 +42,8 @@ config PROC_VMCORE bool "/proc/vmcore support" depends on PROC_FS && CRASH_DUMP default y - help - Exports the dump image of crashed kernel in ELF format. + help + Exports the dump image of crashed kernel in ELF format. config PROC_VMCORE_DEVICE_DUMP bool "Device Hardware/Firmware Log Collection" @@ -72,7 +72,7 @@ config PROC_SYSCTL a recompile of the kernel or reboot of the system. The primary interface is through /proc/sys. If you say Y here a tree of modifiable sysctl entries will be generated beneath the - /proc/sys directory. They are explained in the files + /proc/sys directory. They are explained in the files in <file:Documentation/admin-guide/sysctl/>. Note that enabling this option will enlarge the kernel by at least 8 KB. @@ -88,7 +88,7 @@ config PROC_PAGE_MONITOR Various /proc files exist to monitor process memory utilization: /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, /proc/kpagecount, and /proc/kpageflags. Disabling these - interfaces will reduce the size of the kernel by approximately 4kb. + interfaces will reduce the size of the kernel by approximately 4kb. 
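The pipe_set_size() copy above linearizes the old ring into the new allocation so the new indices can simply start at tail = 0, head = n. Since the occupied region may wrap, it takes at most two memcpy() segments; standalone (int slots instead of struct pipe_buffer, names local):

#include <string.h>

/* Copy the n = head - tail occupied slots of a power-of-two ring of
 * old_size entries into dst, starting at index 0. */
static void linearize(int *dst, const int *src, unsigned int old_size,
		      unsigned int head, unsigned int tail)
{
	unsigned int mask = old_size - 1;
	unsigned int n = head - tail;
	unsigned int t = tail & mask, h = head & mask;

	if (h > t) {			/* contiguous */
		memcpy(dst, src + t, n * sizeof(*src));
	} else if (n) {			/* wraps: tail segment, then head */
		unsigned int tsize = old_size - t;

		memcpy(dst, src + t, tsize * sizeof(*src));
		memcpy(dst + tsize, src, h * sizeof(*src));
	}
	/* caller now sets tail = 0, head = n */
}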
config PROC_CHILDREN bool "Include /proc/<pid>/task/<tid>/children file" diff --git a/fs/proc/array.c b/fs/proc/array.c index 46dcb6f0eccf..5efaf3708ec6 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -533,7 +533,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, nice = task_nice(task); /* convert nsec -> ticks */ - start_time = nsec_to_clock_t(task->real_start_time); + start_time = nsec_to_clock_t(task->start_boottime); seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 64e9ee1b129e..074e9585c699 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -138,8 +138,12 @@ static int proc_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = d_inode(path->dentry); struct proc_dir_entry *de = PDE(inode); - if (de && de->nlink) - set_nlink(inode, de->nlink); + if (de) { + nlink_t nlink = READ_ONCE(de->nlink); + if (nlink > 0) { + set_nlink(inode, nlink); + } + } generic_fillattr(inode, stat); return 0; @@ -159,7 +163,6 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, { const char *cp = name, *next; struct proc_dir_entry *de; - unsigned int len; de = *ret; if (!de) @@ -170,13 +173,12 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, if (!next) break; - len = next - cp; - de = pde_subdir_find(de, cp, len); + de = pde_subdir_find(de, cp, next - cp); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; } - cp += len + 1; + cp = next + 1; } *residual = cp; *ret = de; @@ -362,6 +364,7 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, write_unlock(&proc_subdir_lock); goto out_free_inum; } + dir->nlink++; write_unlock(&proc_subdir_lock); return dp; @@ -472,10 +475,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->data = data; ent->proc_fops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; - parent->nlink++; ent = proc_register(parent, ent); - if (!ent) - parent->nlink--; } return ent; } @@ -505,10 +505,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->data = NULL; ent->proc_fops = NULL; ent->proc_iops = NULL; - parent->nlink++; ent = proc_register(parent, ent); - if (!ent) - parent->nlink--; } return ent; } @@ -666,8 +663,12 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) len = strlen(fn); de = pde_subdir_find(parent, fn, len); - if (de) + if (de) { rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) { + parent->nlink--; + } + } write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -676,9 +677,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) proc_entry_rundown(de); - if (S_ISDIR(de->mode)) - parent->nlink--; - de->nlink = 0; WARN(pde_subdir_first(de), "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, pde_subdir_first(de)->name); @@ -714,13 +712,12 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) de = next; continue; } - write_unlock(&proc_subdir_lock); - - proc_entry_rundown(de); next = de->parent; if (S_ISDIR(de->mode)) next->nlink--; - de->nlink = 0; + write_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); if (de == root) break; pde_put(de); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index cd0c8d5ce9a1..0f3b557c9b77 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -197,8 +197,8 @@ extern ssize_t 
proc_simple_write(struct file *, const char __user *, size_t, lof * inode.c */ struct pde_opener { - struct file *file; struct list_head lh; + struct file *file; bool closing; struct completion *c; } __randomize_layout; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index ac9247371871..8c1f1bb1a5ce 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -132,9 +132,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); - show_val_kb(m, "FileHugePages: ", + show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); - show_val_kb(m, "FilePmdMapped: ", + show_val_kb(m, "FilePmdMapped: ", global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); #endif diff --git a/fs/proc/page.c b/fs/proc/page.c index 544d1ee15aee..7c952ee732e6 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -42,10 +42,12 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); + if (!ppage || PageSlab(ppage) || page_has_type(ppage)) pcount = 0; else @@ -216,10 +218,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); if (put_user(stable_page_flags(ppage), out)) { ret = -EFAULT; @@ -261,10 +264,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. 
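All three kpage* readers above trade pfn_valid() + pfn_to_page() for pfn_to_online_page(), which additionally rejects sections that exist but are not online (the uninitialized-memmap case the TODO comments mention). The resulting guard collapses to one NULL test; sketched against the kernel API (illustration only):

/* Map a pfn to a page only when its memmap is known-initialized;
 * stable_page_flags() already reports KPF_NOPAGE for a NULL page. */
static u64 flags_for_pfn(unsigned long pfn)
{
	struct page *page = pfn_to_online_page(pfn);

	return stable_page_flags(page);
}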
+ */ + ppage = pfn_to_online_page(pfn); if (ppage) ino = page_cgroup_ino(ppage); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 80c305f206bb..37bdbec5b402 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -120,20 +120,23 @@ static int show_stat(struct seq_file *p, void *v) getboottime64(&boottime); for_each_possible_cpu(i) { - struct kernel_cpustat *kcs = &kcpustat_cpu(i); - - user += kcs->cpustat[CPUTIME_USER]; - nice += kcs->cpustat[CPUTIME_NICE]; - system += kcs->cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(kcs, i); - iowait += get_iowait_time(kcs, i); - irq += kcs->cpustat[CPUTIME_IRQ]; - softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; - steal += kcs->cpustat[CPUTIME_STEAL]; - guest += kcs->cpustat[CPUTIME_GUEST]; - guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; - sum += kstat_cpu_irqs_sum(i); - sum += arch_irq_stat_cpu(i); + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); + + user += cpustat[CPUTIME_USER]; + nice += cpustat[CPUTIME_NICE]; + system += cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(&kcpustat, i); + iowait += get_iowait_time(&kcpustat, i); + irq += cpustat[CPUTIME_IRQ]; + softirq += cpustat[CPUTIME_SOFTIRQ]; + steal += cpustat[CPUTIME_STEAL]; + guest += cpustat[CPUTIME_GUEST]; + guest_nice += cpustat[CPUTIME_GUEST_NICE]; + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); @@ -157,19 +160,22 @@ static int show_stat(struct seq_file *p, void *v) seq_putc(p, '\n'); for_each_online_cpu(i) { - struct kernel_cpustat *kcs = &kcpustat_cpu(i); + struct kernel_cpustat kcpustat; + u64 *cpustat = kcpustat.cpustat; + + kcpustat_cpu_fetch(&kcpustat, i); /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcs->cpustat[CPUTIME_USER]; - nice = kcs->cpustat[CPUTIME_NICE]; - system = kcs->cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(kcs, i); - iowait = get_iowait_time(kcs, i); - irq = kcs->cpustat[CPUTIME_IRQ]; - softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; - steal = kcs->cpustat[CPUTIME_STEAL]; - guest = kcs->cpustat[CPUTIME_GUEST]; - guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; + user = cpustat[CPUTIME_USER]; + nice = cpustat[CPUTIME_NICE]; + system = cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(&kcpustat, i); + iowait = get_iowait_time(&kcpustat, i); + irq = cpustat[CPUTIME_IRQ]; + softirq = cpustat[CPUTIME_SOFTIRQ]; + steal = cpustat[CPUTIME_STEAL]; + guest = cpustat[CPUTIME_GUEST]; + guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 3d7024662d29..d896457e7c11 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -793,7 +793,7 @@ static void pstore_timefunc(struct timer_list *unused) jiffies + msecs_to_jiffies(pstore_update_ms)); } -void __init pstore_choose_compression(void) +static void __init pstore_choose_compression(void) { const struct pstore_zbackend *step; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6e826b454082..b0688c02dc90 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -497,7 +497,7 @@ int dquot_release(struct dquot *dquot) mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ - if (atomic_read(&dquot->dq_count) > 1) + if (dquot_is_busy(dquot)) goto out_dqlock; if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret =
dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); @@ -595,7 +595,6 @@ int dquot_scan_active(struct super_block *sb, /* Now we have active dquot so we can just increase use count */ atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); - dqstats_inc(DQST_LOOKUPS); dqput(old_dquot); old_dquot = dquot; /* @@ -623,7 +622,7 @@ EXPORT_SYMBOL(dquot_scan_active); /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { - struct list_head *dirty; + struct list_head dirty; struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); int cnt; @@ -637,9 +636,10 @@ int dquot_writeback_dquots(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); - dirty = &dqopt->info[cnt].dqi_dirty_list; - while (!list_empty(dirty)) { - dquot = list_first_entry(dirty, struct dquot, + /* Move list away to avoid livelock. */ + list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty); + while (!list_empty(&dirty)) { + dquot = list_first_entry(&dirty, struct dquot, dq_dirty); WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); @@ -649,7 +649,6 @@ int dquot_writeback_dquots(struct super_block *sb, int type) * use count */ dqgrab(dquot); spin_unlock(&dq_list_lock); - dqstats_inc(DQST_LOOKUPS); err = sb->dq_op->write_dquot(dquot); if (err) { /* @@ -2162,14 +2161,29 @@ int dquot_file_open(struct inode *inode, struct file *file) } EXPORT_SYMBOL(dquot_file_open); +static void vfs_cleanup_quota_inode(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode = dqopt->files[type]; + + if (!inode) + return; + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + inode_lock(inode); + inode->i_flags &= ~S_NOQUOTA; + inode_unlock(inode); + } + dqopt->files[type] = NULL; + iput(inode); +} + /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int dquot_disable(struct super_block *sb, int type, unsigned int flags) { - int cnt, ret = 0; + int cnt; struct quota_info *dqopt = sb_dqopt(sb); - struct inode *toputinode[MAXQUOTAS]; /* s_umount should be held in exclusive mode */ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) @@ -2191,7 +2205,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - toputinode[cnt] = NULL; if (type != -1 && cnt != type) continue; if (!sb_has_quota_loaded(sb, cnt)) @@ -2211,8 +2224,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); - iput(dqopt->files[cnt]); - dqopt->files[cnt] = NULL; + vfs_cleanup_quota_inode(sb, cnt); continue; } spin_unlock(&dq_state_lock); @@ -2234,10 +2246,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); put_quota_format(dqopt->info[cnt].dqi_format); - - toputinode[cnt] = dqopt->files[cnt]; - if (!sb_has_quota_loaded(sb, cnt)) - dqopt->files[cnt] = NULL; dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; @@ -2259,32 +2267,22 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - /* This can happen when suspending quotas on remount-ro... 
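dquot_writeback_dquots() above now detaches the whole dirty list with list_replace_init() before draining it, so entries re-dirtied during writeback queue on the live list rather than keeping the walker spinning. The move-then-drain idiom, standalone (singly linked for brevity, names local):

#include <pthread.h>
#include <stddef.h>

struct node {
	struct node *next;
};

struct dirty_list {
	pthread_mutex_t lock;
	struct node *head;
};

/* Steal every queued entry in O(1) under the lock, then let the caller
 * drain unlocked; concurrent producers see an empty live list. */
static struct node *steal_all(struct dirty_list *d)
{
	struct node *batch;

	pthread_mutex_lock(&d->lock);
	batch = d->head;
	d->head = NULL;
	pthread_mutex_unlock(&d->lock);
	return batch;
}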
*/ - if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { - inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~S_NOQUOTA; - truncate_inode_pages(&toputinode[cnt]->i_data, 0); - inode_unlock(toputinode[cnt]); - mark_inode_dirty_sync(toputinode[cnt]); + if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) { + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); put_inodes: + /* We are done when suspending quotas */ + if (flags & DQUOT_SUSPENDED) + return 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (toputinode[cnt]) { - /* On remount RO, we keep the inode pointer so that we - * can reenable quota on the subsequent remount RW. We - * have to check 'flags' variable and not use sb_has_ - * function because another quotaon / quotaoff could - * change global state before we got here. We refuse - * to suspend quotas when there is pending delete on - * the quota file... */ - if (!(flags & DQUOT_SUSPENDED)) - iput(toputinode[cnt]); - else if (!toputinode[cnt]->i_nlink) - ret = -EBUSY; - } - return ret; + if (!sb_has_quota_loaded(sb, cnt)) + vfs_cleanup_quota_inode(sb, cnt); + return 0; } EXPORT_SYMBOL(dquot_disable); @@ -2299,28 +2297,52 @@ EXPORT_SYMBOL(dquot_quota_off); * Turn quotas on on a device */ -/* - * Helper function to turn quotas on when we already have the inode of - * quota file and no quota information is loaded. - */ -static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, +static int vfs_setup_quota_inode(struct inode *inode, int type) +{ + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!S_ISREG(inode->i_mode)) + return -EACCES; + if (IS_RDONLY(inode)) + return -EROFS; + if (sb_has_quota_loaded(sb, type)) + return -EBUSY; + + dqopt->files[type] = igrab(inode); + if (!dqopt->files[type]) + return -EIO; + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. */ + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + /* + * When S_NOQUOTA is set, remove dquot references as no more + * references can be added + */ + __dquot_drop(inode); + } + return 0; +} + +int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { struct quota_format_type *fmt = find_quota_format(format_id); - struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); int error; + /* Just unsuspend quotas? */ + BUG_ON(flags & DQUOT_SUSPENDED); + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); + if (!fmt) return -ESRCH; - if (!S_ISREG(inode->i_mode)) { - error = -EACCES; - goto out_fmt; - } - if (IS_RDONLY(inode)) { - error = -EROFS; - goto out_fmt; - } if (!sb->s_op->quota_write || !sb->s_op->quota_read || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; @@ -2352,27 +2374,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, invalidate_bdev(sb->s_bdev); } - if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { - /* We don't want quota and atime on quota files (deadlocks - * possible) Also nobody should write to the file - we use - * special IO operations which ignore the immutable bit. 
*/ - inode_lock(inode); - inode->i_flags |= S_NOQUOTA; - inode_unlock(inode); - /* - * When S_NOQUOTA is set, remove dquot references as no more - * references can be added - */ - __dquot_drop(inode); - } - - error = -EIO; - dqopt->files[type] = igrab(inode); - if (!dqopt->files[type]) - goto out_file_flags; error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) - goto out_file_init; + goto out_fmt; dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; @@ -2380,7 +2384,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) - goto out_file_init; + goto out_fmt; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; @@ -2395,24 +2399,36 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dquot_disable(sb, type, flags); return error; -out_file_init: - dqopt->files[type] = NULL; - iput(inode); -out_file_flags: - inode_lock(inode); - inode->i_flags &= ~S_NOQUOTA; - inode_unlock(inode); out_fmt: put_quota_format(fmt); return error; } +EXPORT_SYMBOL(dquot_load_quota_sb); + +/* + * More powerful function for turning on quotas on given quota inode allowing + * setting of individual quota flags + */ +int dquot_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int err; + + err = vfs_setup_quota_inode(inode, type); + if (err < 0) + return err; + err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags); + if (err < 0) + vfs_cleanup_quota_inode(inode->i_sb, type); + return err; +} +EXPORT_SYMBOL(dquot_load_quota_inode); /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); - struct inode *inode; int ret = 0, cnt; unsigned int flags; @@ -2426,8 +2442,6 @@ int dquot_resume(struct super_block *sb, int type) if (!sb_has_quota_suspended(sb, cnt)) continue; - inode = dqopt->files[cnt]; - dqopt->files[cnt] = NULL; spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, @@ -2436,9 +2450,10 @@ int dquot_resume(struct super_block *sb, int type) spin_unlock(&dq_state_lock); flags = dquot_generic_flag(flags, cnt); - ret = vfs_load_quota_inode(inode, cnt, - dqopt->info[cnt].dqi_fmt_id, flags); - iput(inode); + ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, + flags); + if (ret < 0) + vfs_cleanup_quota_inode(sb, type); } return ret; @@ -2455,7 +2470,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) error = -EXDEV; else - error = vfs_load_quota_inode(d_inode(path->dentry), type, + error = dquot_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; @@ -2463,41 +2478,6 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, EXPORT_SYMBOL(dquot_quota_on); /* - * More powerful function for turning on quotas allowing setting - * of individual quota flags - */ -int dquot_enable(struct inode *inode, int type, int format_id, - unsigned int flags) -{ - struct super_block *sb = inode->i_sb; - - /* Just unsuspend quotas? */ - BUG_ON(flags & DQUOT_SUSPENDED); - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); - - if (!flags) - return 0; - /* Just updating flags needed? 
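Worth noting in the refactor above: dquot_load_quota_inode() is now a plain composition of two steps, vfs_setup_quota_inode() to claim the inode and dquot_load_quota_sb() to load the quota format, with the first step rolled back if the second fails. That shape, each setup step paired with an undo that runs only on later failure, is sketched below with hypothetical step names (nothing here is quota code):

#include <stdio.h>

/* Hypothetical two-phase bring-up: every setup step has an undo. */
static int setup_inode(void)
{
        puts("setup: claim inode");
        return 0;
}

static void cleanup_inode(void)
{
        puts("cleanup: release inode");
}

static int load_format(int simulate_failure)
{
        puts("setup: load format");
        return simulate_failure ? -1 : 0;
}

static int load_quota(int simulate_failure)
{
        int err = setup_inode();

        if (err < 0)
                return err;
        err = load_format(simulate_failure);
        if (err < 0)
                cleanup_inode();        /* roll back step one */
        return err;
}

int main(void)
{
        printf("success path -> %d\n", load_quota(0));
        printf("failure path -> %d\n", load_quota(1));
        return 0;
}

Keeping the undo in the caller rather than inside the second step mirrors the patch: dquot_load_quota_sb() stays usable on its own, which is what lets dquot_resume() call it without any inode setup.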
*/ - if (sb_has_quota_loaded(sb, type)) { - if (flags & DQUOT_USAGE_ENABLED && - sb_has_quota_usage_enabled(sb, type)) - return -EBUSY; - if (flags & DQUOT_LIMITS_ENABLED && - sb_has_quota_limits_enabled(sb, type)) - return -EBUSY; - spin_lock(&dq_state_lock); - sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); - spin_unlock(&dq_state_lock); - return 0; - } - - return vfs_load_quota_inode(inode, type, format_id, flags); -} -EXPORT_SYMBOL(dquot_enable); - -/* * This function is used when filesystem needs to initialize quotas * during mount time. */ @@ -2507,21 +2487,15 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name)); + dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name)); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (d_really_is_negative(dentry)) { - error = -ENOENT; - goto out; - } - error = security_quota_on(dentry); if (!error) - error = vfs_load_quota_inode(d_inode(dentry), type, format_id, + error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); -out: dput(dentry); return error; } @@ -2543,13 +2517,17 @@ static int dquot_quota_enable(struct super_block *sb, unsigned int flags) if (!(flags & qtype_enforce_flag(type))) continue; /* Can't enforce without accounting */ - if (!sb_has_quota_usage_enabled(sb, type)) - return -EINVAL; - ret = dquot_enable(dqopt->files[type], type, - dqopt->info[type].dqi_fmt_id, - DQUOT_LIMITS_ENABLED); - if (ret < 0) + if (!sb_has_quota_usage_enabled(sb, type)) { + ret = -EINVAL; goto out_err; + } + if (sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; + goto out_err; + } + spin_lock(&dq_state_lock); + dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); + spin_unlock(&dq_state_lock); } return 0; out_err: @@ -2599,10 +2577,12 @@ static int dquot_quota_disable(struct super_block *sb, unsigned int flags) out_err: /* Backout enforcement disabling we already did */ for (type--; type >= 0; type--) { - if (flags & qtype_enforce_flag(type)) - dquot_enable(dqopt->files[type], type, - dqopt->info[type].dqi_fmt_id, - DQUOT_LIMITS_ENABLED); + if (flags & qtype_enforce_flag(type)) { + spin_lock(&dq_state_lock); + dqopt->flags |= + dquot_state_flag(DQUOT_LIMITS_ENABLED, type); + spin_unlock(&dq_state_lock); + } } return ret; } @@ -2800,8 +2780,10 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state) tstate->flags |= QCI_LIMITS_ENFORCED; tstate->spc_timelimit = mi->dqi_bgrace; tstate->ino_timelimit = mi->dqi_igrace; - tstate->ino = dqopt->files[type]->i_ino; - tstate->blocks = dqopt->files[type]->i_blocks; + if (dqopt->files[type]) { + tstate->ino = dqopt->files[type]->i_ino; + tstate->blocks = dqopt->files[type]->i_blocks; + } tstate->nextents = 1; /* We don't know... 
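The do_proc_dqstats() conversion in the next hunk switches from percpu_counter_sum_positive() to a signed percpu_counter_sum() and clamps only DQST_ALLOC_DQUOTS and DQST_FREE_DQUOTS, the two counters that behave as gauges: when the increment and the matching decrement land on different CPUs, a snapshot of the per-CPU shards can be transiently negative even though the true value never is. A toy sharded counter showing that effect (plain C, no quota internals):

#include <stdio.h>

#define NR_CPUS 4

/* Sharded counter: each CPU only ever touches its own slot. */
static long shard[NR_CPUS];

static void inc_on(int cpu) { shard[cpu]++; }
static void dec_on(int cpu) { shard[cpu]--; }

static long snapshot(void)
{
        long sum = 0;

        for (int i = 0; i < NR_CPUS; i++)
                sum += shard[i];
        return sum;
}

int main(void)
{
        /* One object allocated on CPU 0 but freed on CPU 3 ... */
        inc_on(0);
        dec_on(3);
        /* ... leaves shards at +1 and -1; the signed sum is still 0. */
        printf("shard0=%ld shard3=%ld sum=%ld\n",
               shard[0], shard[3], snapshot());

        /*
         * One more decrement than this snapshot has seen increments
         * and the total itself dips below zero; a gauge such as
         * "allocated dquots" clamps that to 0 for reporting.
         */
        dec_on(2);
        long v = snapshot();
        printf("raw=%ld reported=%ld\n", v, v < 0 ? 0 : v);
        return 0;
}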
*/ spin_unlock(&dq_data_lock); } @@ -2860,68 +2842,73 @@ EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - unsigned int type = (int *)table->data - dqstats.stat; + unsigned int type = (unsigned long *)table->data - dqstats.stat; + s64 value = percpu_counter_sum(&dqstats.counter[type]); + + /* Filter negative values for non-monotonic counters */ + if (value < 0 && (type == DQST_ALLOC_DQUOTS || + type == DQST_FREE_DQUOTS)) + value = 0; /* Update global table */ - dqstats.stat[type] = - percpu_counter_sum_positive(&dqstats.counter[type]); - return proc_dointvec(table, write, buffer, lenp, ppos); + dqstats.stat[type] = value; + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "drops", .data = &dqstats.stat[DQST_DROPS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "reads", .data = &dqstats.stat[DQST_READS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "writes", .data = &dqstats.stat[DQST_WRITES], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "cache_hits", .data = &dqstats.stat[DQST_CACHE_HITS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "allocated_dquots", .data = &dqstats.stat[DQST_ALLOC_DQUOTS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "free_dquots", .data = &dqstats.stat[DQST_FREE_DQUOTS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "syncs", .data = &dqstats.stat[DQST_SYNCS], - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, @@ -2983,11 +2970,7 @@ static int __init dquot_init(void) /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); - dq_hash_bits = 0; - do { - dq_hash_bits++; - } while (nr_hash >> dq_hash_bits); - dq_hash_bits--; + dq_hash_bits = ilog2(nr_hash); nr_hash = 1UL << dq_hash_bits; dq_hash_mask = nr_hash - 1; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index cb13fb76dbee..5444d3c4d93f 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -60,8 +60,6 @@ static int quota_sync_all(int type) { int ret; - if (type >= MAXQUOTAS) - return -EINVAL; ret = security_quotactl(Q_SYNC, type, 0, NULL); if (!ret) iterate_supers(quota_sync_one, &type); @@ -686,8 +684,6 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, { int ret; - if (type >= MAXQUOTAS) - return -EINVAL; type = array_index_nospec(type, MAXQUOTAS); /* * Quota not supported on this fs? Check this before s_quota_types @@ -831,6 +827,9 @@ int kernel_quotactl(unsigned int cmd, const char __user *special, cmds = cmd >> SUBCMDSHIFT; type = cmd & SUBCMDMASK; + if (type >= MAXQUOTAS) + return -EINVAL; + /* * As a special case Q_SYNC can be called without a specific device. 
* It will iterate all superblocks that have quota enabled and call diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index c740e5572eb8..cd92e5fa0062 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -217,7 +217,6 @@ static const struct quota_format_ops v1_format_ops = { .check_quota_file = v1_check_quota_file, .read_file_info = v1_read_file_info, .write_file_info = v1_write_file_info, - .free_file_info = NULL, .read_dqblk = v1_read_dqblk, .commit_dqblk = v1_commit_dqblk, }; diff --git a/fs/readdir.c b/fs/readdir.c index 6e2623e57b2e..d26d5ea4de7b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -105,9 +105,9 @@ EXPORT_SYMBOL(iterate_dir); */ static int verify_dirent_name(const char *name, int len) { - if (WARN_ON_ONCE(!len)) + if (!len) return -EIO; - if (WARN_ON_ONCE(memchr(name, '/', len))) + if (memchr(name, '/', len)) return -EIO; return 0; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 843aadcc123c..84cf8bdbec9c 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -38,16 +38,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) BUG_ON(!S_ISREG(inode->i_mode)); - if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1)) + if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers, + &REISERFS_I(inode)->tailpack)) return 0; - mutex_lock(&REISERFS_I(inode)->tailpack); - - if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) { - mutex_unlock(&REISERFS_I(inode)->tailpack); - return 0; - } - /* fast out for when nothing needs to be done */ if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || !tail_has_to_be_packed(inode)) && diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 132ec4406ed0..6419e6dacc39 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2097,6 +2097,15 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_inserted_sd; } + /* + * Mark it private if we're creating the privroot + * or something under it. + */ + if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) { + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } + if (reiserfs_posixacl(inode->i_sb)) { reiserfs_write_unlock(inode->i_sb); retval = reiserfs_inherit_default_acl(th, dir, dentry, inode); @@ -2111,8 +2120,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, reiserfs_warning(inode->i_sb, "jdm-13090", "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); - } else if (IS_PRIVATE(dir)) - inode->i_flags |= S_PRIVATE; + } if (security->name) { reiserfs_write_unlock(inode->i_sb); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 97f3fc4fdd79..959a066b7bb0 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -377,10 +377,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, /* * Propagate the private flag so we know we're - * in the priv tree + * in the priv tree. Also clear IOP_XATTR + * since we don't have xattrs on xattr files. 
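Backing up to the fs/reiserfs/file.c hunk above: reiserfs_file_release() collapses an open-coded "fast-path decrement, slow-path lock and re-check" into atomic_dec_and_mutex_lock(), which returns true only for the final reference and, in that case, returns with the mutex held. A userspace approximation of the helper's semantics (not its kernel implementation), assuming C11 atomics and POSIX mutexes:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Decrement *cnt; if it reaches zero, take the lock and return true
 * with the lock held.  The re-check under the lock matters: another
 * thread may have bumped the count while we waited for the mutex.
 */
static bool dec_and_mutex_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        /* Fast path: not the last reference, never touch the lock. */
        int old = atomic_load(cnt);

        while (old > 1) {
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return false;
        }

        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
                return true;            /* last reference, keep the lock */
        pthread_mutex_unlock(lock);
        return false;
}

static atomic_int openers = 1;
static pthread_mutex_t tailpack = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
        if (dec_and_mutex_lock(&openers, &tailpack)) {
                puts("last closer: pack the tail");
                pthread_mutex_unlock(&tailpack);
        }
        return 0;
}

The point of the helper is that only the thread taking the count to zero pays for the lock, and it leaves with the lock held so the last-closer work is serialized.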
*/ - if (IS_PRIVATE(dir)) + if (IS_PRIVATE(dir)) { inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + } } reiserfs_write_unlock(dir->i_sb); if (retval == IO_ERROR) { diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index e5ca9ed79e54..726580114d55 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1168,6 +1168,8 @@ static inline int bmap_would_wrap(unsigned bmap_nr) return bmap_nr > ((1LL << 16) - 1); } +extern const struct xattr_handler *reiserfs_xattr_handlers[]; + /* * this says about version of key of all items (but stat data) the * object consists of diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d69b4ac0ae2f..3244037b1286 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2049,6 +2049,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (replay_only(s)) goto error_unlocked; + s->s_xattr = reiserfs_xattr_handlers; + if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { SWARN(silent, s, "clm-7000", "Detected readonly device, marking FS readonly"); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index b5b26d8a192c..62b40df36c98 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -122,13 +122,13 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) struct dentry *xaroot; if (d_really_is_negative(privroot)) - return ERR_PTR(-ENODATA); + return ERR_PTR(-EOPNOTSUPP); inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) - xaroot = ERR_PTR(-ENODATA); + xaroot = ERR_PTR(-EOPNOTSUPP); else if (d_really_is_negative(xaroot)) { int err = -ENODATA; @@ -619,6 +619,10 @@ int reiserfs_xattr_set(struct inode *inode, const char *name, int error, error2; size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size); + /* Check before we start a transaction and then do nothing. 
*/ + if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root)) + return -EOPNOTSUPP; + if (!(flags & XATTR_REPLACE)) jbegin_count += reiserfs_xattr_jcreate_nblocks(inode); @@ -841,8 +845,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) if (d_really_is_negative(dentry)) return -EINVAL; - if (!dentry->d_sb->s_xattr || - get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) + if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); @@ -882,6 +885,7 @@ static int create_privroot(struct dentry *dentry) } d_inode(dentry)->i_flags |= S_PRIVATE; + d_inode(dentry)->i_opflags &= ~IOP_XATTR; reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " "storage.\n", PRIVROOT_NAME); @@ -895,7 +899,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -static const struct xattr_handler *reiserfs_xattr_handlers[] = { +const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, @@ -966,8 +970,10 @@ int reiserfs_lookup_privroot(struct super_block *s) if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (d_really_is_positive(dentry)) + if (d_really_is_positive(dentry)) { d_inode(dentry)->i_flags |= S_PRIVATE; + d_inode(dentry)->i_opflags &= ~IOP_XATTR; + } } else err = PTR_ERR(dentry); inode_unlock(d_inode(s->s_root)); @@ -996,7 +1002,6 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) } if (d_really_is_positive(privroot)) { - s->s_xattr = reiserfs_xattr_handlers; inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index aa9380bac196..05f666794561 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -320,10 +320,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, * would be useless since permissions are ignored, and a pain because * it introduces locking cycles */ - if (IS_PRIVATE(dir)) { - inode->i_flags |= S_PRIVATE; + if (IS_PRIVATE(inode)) goto apply_umask; - } err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (err) diff --git a/fs/select.c b/fs/select.c index 53a0c149f528..11d0285d46b7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -321,7 +321,7 @@ static int poll_select_finish(struct timespec64 *end_time, switch (pt_type) { case PT_TIMEVAL: { - struct timeval rtv; + struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); @@ -698,10 +698,10 @@ out_nofds: } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timeval __user *tvp) + fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; - struct timeval tv; + struct __kernel_old_timeval tv; int ret; if (tvp) { @@ -720,7 +720,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, - fd_set __user *, exp, struct timeval __user *, tvp) + fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } @@ -810,7 +810,7 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, struct sel_arg_struct { unsigned long n; fd_set 
__user *inp, *outp, *exp; - struct timeval __user *tvp; + struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) diff --git a/fs/splice.c b/fs/splice.c index 98412721f056..3009652a41c8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; + unsigned int tail = pipe->tail; + unsigned int head = pipe->head; + unsigned int mask = pipe->ring_size - 1; int ret = 0, page_nr = 0; if (!spd_pages) @@ -196,9 +199,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, goto out; } - while (pipe->nrbufs < pipe->buffers) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + newbuf; + while (!pipe_full(head, tail, pipe->max_usage)) { + struct pipe_buffer *buf = &pipe->bufs[head & mask]; buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; @@ -207,7 +209,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->ops = spd->ops; buf->flags = 0; - pipe->nrbufs++; + head++; + pipe->head = head; page_nr++; ret += buf->len; @@ -228,17 +231,19 @@ EXPORT_SYMBOL_GPL(splice_to_pipe); ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { + unsigned int head = pipe->head; + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; - } else if (pipe->nrbufs == pipe->buffers) { + } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - pipe->bufs[newbuf] = *buf; - pipe->nrbufs++; + pipe->bufs[head & mask] = *buf; + pipe->head = head + 1; return buf->len; } pipe_buf_release(pipe, buf); @@ -252,14 +257,14 @@ EXPORT_SYMBOL(add_to_pipe); */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - unsigned int buffers = READ_ONCE(pipe->buffers); + unsigned int max_usage = READ_ONCE(pipe->max_usage); - spd->nr_pages_max = buffers; - if (buffers <= PIPE_DEF_BUFFERS) + spd->nr_pages_max = max_usage; + if (max_usage <= PIPE_DEF_BUFFERS) return 0; - spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL); - spd->partial = kmalloc_array(buffers, sizeof(struct partial_page), + spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) @@ -298,10 +303,11 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, { struct iov_iter to; struct kiocb kiocb; - int idx, ret; + unsigned int i_head; + int ret; iov_iter_pipe(&to, READ, pipe, len); - idx = to.idx; + i_head = to.head; init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); @@ -309,7 +315,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, *ppos = kiocb.ki_pos; file_accessed(in); } else if (ret < 0) { - to.idx = idx; + to.head = i_head; to.iov_offset = 0; iov_iter_advance(&to, 0); /* to free what was emitted */ /* @@ -370,11 +376,12 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct iov_iter to; struct page **pages; unsigned int nr_pages; + unsigned int mask; size_t offset, base, copied = 0; ssize_t res; int i; - if (pipe->nrbufs == pipe->buffers) + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return -EAGAIN; /* @@ 
-400,8 +407,9 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, } } - pipe->bufs[to.idx].offset = offset; - pipe->bufs[to.idx].len -= offset; + mask = pipe->ring_size - 1; + pipe->bufs[to.head & mask].offset = offset; + pipe->bufs[to.head & mask].len -= offset; for (i = 0; i < nr_pages; i++) { size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); @@ -443,7 +451,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; - if (sd->len < sd->total_len && pipe->nrbufs > 1) + if (sd->len < sd->total_len && + pipe_occupancy(pipe->head, pipe->tail) > 1) more |= MSG_SENDPAGE_NOTLAST; return file->f_op->sendpage(file, buf->page, buf->offset, @@ -481,10 +490,13 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { + unsigned int head = pipe->head; + unsigned int tail = pipe->tail; + unsigned int mask = pipe->ring_size - 1; int ret; - while (pipe->nrbufs) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + while (!pipe_empty(head, tail)) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; sd->len = buf->len; if (sd->len > sd->total_len) @@ -511,8 +523,8 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des if (!buf->len) { pipe_buf_release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); - pipe->nrbufs--; + tail++; + pipe->tail = tail; if (pipe->files) sd->need_wakeup = true; } @@ -543,11 +555,11 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des if (signal_pending(current)) return -ERESTARTSYS; - while (!pipe->nrbufs) { + while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; - if (!pipe->waiting_writers && sd->num_spliced) + if (sd->num_spliced) return 0; if (sd->flags & SPLICE_F_NONBLOCK) @@ -686,7 +698,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, .pos = *ppos, .u.file = out, }; - int nbufs = pipe->buffers; + int nbufs = pipe->max_usage; struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); ssize_t ret; @@ -699,16 +711,17 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { struct iov_iter from; + unsigned int head, tail, mask; size_t left; - int n, idx; + int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; - if (unlikely(nbufs < pipe->buffers)) { + if (unlikely(nbufs < pipe->max_usage)) { kfree(array); - nbufs = pipe->buffers; + nbufs = pipe->max_usage; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (!array) { @@ -717,18 +730,19 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } } + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + /* build the vector */ left = sd.total_len; - for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { - struct pipe_buffer *buf = pipe->bufs + idx; + for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; if (this_len > left) this_len = left; - if (idx == pipe->buffers - 1) - idx = -1; - ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) @@ -752,14 +766,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ + tail = 
pipe->tail; while (ret) { - struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); - pipe->nrbufs--; + tail++; + pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { @@ -942,15 +957,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->flags &= ~SPLICE_F_NONBLOCK; more = sd->flags & SPLICE_F_MORE; - WARN_ON_ONCE(pipe->nrbufs != 0); + WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); while (len) { + unsigned int p_space; size_t read_len; loff_t pos = sd->pos, prev_pos = pos; /* Don't try to read more the pipe has space for. */ - read_len = min_t(size_t, len, - (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT); + p_space = pipe->max_usage - + pipe_occupancy(pipe->head, pipe->tail); + read_len = min_t(size_t, len, p_space << PAGE_SHIFT); ret = do_splice_to(in, &pos, pipe, read_len, flags); if (unlikely(ret <= 0)) goto out_release; @@ -989,7 +1006,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, } done: - pipe->nrbufs = pipe->curbuf = 0; + pipe->tail = pipe->head = 0; file_accessed(in); return bytes; @@ -998,8 +1015,8 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < pipe->buffers; i++) { - struct pipe_buffer *buf = pipe->bufs + i; + for (i = 0; i < pipe->ring_size; i++) { + struct pipe_buffer *buf = &pipe->bufs[i]; if (buf->ops) pipe_buf_release(pipe, buf); @@ -1075,15 +1092,13 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) send_sig(SIGPIPE, current, 0); return -EPIPE; } - if (pipe->nrbufs != pipe->buffers) + if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; - pipe->waiting_writers++; pipe_wait(pipe); - pipe->waiting_writers--; } } @@ -1180,8 +1195,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, pipe_lock(opipe); ret = wait_for_space(opipe, flags); - if (!ret) + if (!ret) { + unsigned int p_space; + + /* Don't try to read more the pipe has space for. */ + p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail); + len = min_t(size_t, len, p_space << PAGE_SHIFT); + ret = do_splice_to(in, &offset, opipe, len, flags); + } pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); @@ -1442,27 +1464,25 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) int ret; /* - * Check ->nrbufs without the inode lock first. This function + * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe->nrbufs) + if (!pipe_empty(pipe->head, pipe->tail)) return 0; ret = 0; pipe_lock(pipe); - while (!pipe->nrbufs) { + while (pipe_empty(pipe->head, pipe->tail)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!pipe->writers) break; - if (!pipe->waiting_writers) { - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; } pipe_wait(pipe); } @@ -1480,16 +1500,16 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) int ret; /* - * Check ->nrbufs without the inode lock first. This function + * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. 
*/ - if (pipe->nrbufs < pipe->buffers) + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= pipe->buffers) { + while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1503,9 +1523,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -ERESTARTSYS; break; } - pipe->waiting_writers++; pipe_wait(pipe); - pipe->waiting_writers--; } pipe_unlock(pipe); @@ -1520,7 +1538,10 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, nbuf; + unsigned int i_head, o_head; + unsigned int i_tail, o_tail; + unsigned int i_mask, o_mask; + int ret = 0; bool input_wakeup = false; @@ -1540,7 +1561,14 @@ retry: */ pipe_double_lock(ipipe, opipe); + i_tail = ipipe->tail; + i_mask = ipipe->ring_size - 1; + o_head = opipe->head; + o_mask = opipe->ring_size - 1; + do { + size_t o_len; + if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) @@ -1548,14 +1576,18 @@ retry: break; } - if (!ipipe->nrbufs && !ipipe->writers) + i_head = ipipe->head; + o_tail = opipe->tail; + + if (pipe_empty(i_head, i_tail) && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { + if (pipe_empty(i_head, i_tail) || + pipe_full(o_head, o_tail, opipe->max_usage)) { /* Already processed some buffers, break */ if (ret) break; @@ -1575,9 +1607,8 @@ retry: goto retry; } - ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); - obuf = opipe->bufs + nbuf; + ibuf = &ipipe->bufs[i_tail & i_mask]; + obuf = &opipe->bufs[o_head & o_mask]; if (len >= ibuf->len) { /* @@ -1585,10 +1616,12 @@ retry: */ *obuf = *ibuf; ibuf->ops = NULL; - opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); - ipipe->nrbufs--; + i_tail++; + ipipe->tail = i_tail; input_wakeup = true; + o_len = obuf->len; + o_head++; + opipe->head = o_head; } else { /* * Get a reference to this pipe buffer, @@ -1610,12 +1643,14 @@ retry: pipe_buf_mark_unmergeable(obuf); obuf->len = len; - opipe->nrbufs++; - ibuf->offset += obuf->len; - ibuf->len -= obuf->len; + ibuf->offset += len; + ibuf->len -= len; + o_len = len; + o_head++; + opipe->head = o_head; } - ret += obuf->len; - len -= obuf->len; + ret += o_len; + len -= o_len; } while (len); pipe_unlock(ipipe); @@ -1641,7 +1676,10 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, i = 0, nbuf; + unsigned int i_head, o_head; + unsigned int i_tail, o_tail; + unsigned int i_mask, o_mask; + int ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1650,6 +1688,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, */ pipe_double_lock(ipipe, opipe); + i_tail = ipipe->tail; + i_mask = ipipe->ring_size - 1; + o_head = opipe->head; + o_mask = opipe->ring_size - 1; + do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); @@ -1658,15 +1701,19 @@ static int link_pipe(struct pipe_inode_info *ipipe, break; } + i_head = ipipe->head; + o_tail = opipe->tail; + /* - * If we have iterated all input buffers or ran out of + * If we have iterated all input buffers or run out of * output room, break. 
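All of the splice.c churn in this commit is one mechanical conversion: the old curbuf/nrbufs pair becomes free-running head and tail counters over a power-of-two ring, occupancy falls out as head - tail, and a slot is reached by masking the counter with ring_size - 1 (the pipe_empty()/pipe_full()/pipe_occupancy() helpers seen throughout the hunks above). A standalone sketch of that ring discipline, covering the arithmetic only and none of the pipe locking or wakeups:

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8                     /* must be a power of two */
#define RING_MASK (RING_SIZE - 1)

static int bufs[RING_SIZE];
static unsigned int head, tail;         /* free-running, never masked */

/* head == tail  =>  empty;   head - tail == limit  =>  full. */
static int ring_empty(void) { return head == tail; }
static int ring_full(void)  { return head - tail >= RING_SIZE; }

static void ring_push(int v)
{
        assert(!ring_full());
        bufs[head++ & RING_MASK] = v;   /* mask only at the access */
}

static int ring_pop(void)
{
        assert(!ring_empty());
        return bufs[tail++ & RING_MASK];
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                ring_push(i);
        printf("occupancy=%u\n", head - tail);  /* pipe_occupancy() */
        while (!ring_empty())
                printf("%d ", ring_pop());
        putchar('\n');
        return 0;
}

Because the counters are unsigned and only masked at the point of access, head - tail stays correct across wraparound, which is why the converted code never resets them, apart from the tail = head = 0 shortcut taken when the pipe is known to be empty.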
*/ - if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) + if (pipe_empty(i_head, i_tail) || + pipe_full(o_head, o_tail, opipe->max_usage)) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); + ibuf = &ipipe->bufs[i_tail & i_mask]; + obuf = &opipe->bufs[o_head & o_mask]; /* * Get a reference to this pipe buffer, @@ -1678,7 +1725,6 @@ static int link_pipe(struct pipe_inode_info *ipipe, break; } - obuf = opipe->bufs + nbuf; *obuf = *ibuf; /* @@ -1691,19 +1737,13 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (obuf->len > len) obuf->len = len; - - opipe->nrbufs++; ret += obuf->len; len -= obuf->len; - i++; - } while (len); - /* - * return EAGAIN if we have the potential of some data in the - * future, otherwise just return 0 - */ - if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) - ret = -EAGAIN; + o_head++; + opipe->head = o_head; + i_tail++; + } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); diff --git a/fs/timerfd.c b/fs/timerfd.c index 48305ba41e3c..ac7f59a58f94 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -302,11 +302,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, static void timerfd_show(struct seq_file *m, struct file *file) { struct timerfd_ctx *ctx = file->private_data; - struct itimerspec t; + struct timespec64 value, interval; spin_lock_irq(&ctx->wqh.lock); - t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); - t.it_interval = ktime_to_timespec(ctx->tintv); + value = ktime_to_timespec64(timerfd_get_remaining(ctx)); + interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, @@ -318,10 +318,10 @@ static void timerfd_show(struct seq_file *m, struct file *file) ctx->clockid, (unsigned long long)ctx->ticks, ctx->settime_flags, - (unsigned long long)t.it_value.tv_sec, - (unsigned long long)t.it_value.tv_nsec, - (unsigned long long)t.it_interval.tv_sec, - (unsigned long long)t.it_interval.tv_nsec); + (unsigned long long)value.tv_sec, + (unsigned long long)value.tv_nsec, + (unsigned long long)interval.tv_sec, + (unsigned long long)interval.tv_nsec); } #else #define timerfd_show NULL diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index e4b52783819d..0f5a480fe264 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2737,18 +2737,6 @@ static ssize_t dfs_file_write(struct file *file, const char __user *u, struct dentry *dent = file->f_path.dentry; int val; - /* - * TODO: this is racy - the file-system might have already been - * unmounted and we'd oops in this case. The plan is to fix it with - * help of 'iterate_supers_type()' which we should have in v3.0: when - * a debugfs opened, we rember FS's UUID in file->private_data. Then - * whenever we access the FS via a debugfs file, we iterate all UBIFS - * superblocks and fine the one with the same UUID, and take the - * locking right. - * - * The other way to go suggested by Al Viro is to create a separate - * 'ubifs-debug' file-system instead. 
- */ if (file->f_path.dentry == d->dfs_dump_lprops) { ubifs_dump_lprops(c); return count; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 4fd9683b8245..388fe8f5dc51 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) { if (c->double_hash) - dent->cookie = prandom_u32(); + dent->cookie = (__force __le32) prandom_u32(); else dent->cookie = 0; } @@ -899,7 +899,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) fname_name(&nm) = xent->name; fname_len(&nm) = le16_to_cpu(xent->nlen); - xino = ubifs_iget(c->vfs_sb, xent->inum); + xino = ubifs_iget(c->vfs_sb, le64_to_cpu(xent->inum)); if (IS_ERR(xino)) { err = PTR_ERR(xino); ubifs_err(c, "dead directory entry '%s', error %d", diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 3b4b4114f208..54d6db61106f 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -631,12 +631,17 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum; int i, n, err, first = 1; + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) + return -ENOMEM; + list_for_each_entry(snod, &sleb->nodes, list) { if (snod->type != UBIFS_ORPH_NODE) { ubifs_err(c, "invalid node type %d in orphan area at %d:%d", snod->type, sleb->lnum, snod->offs); ubifs_dump_node(c, snod->node); - return -EINVAL; + err = -EINVAL; + goto out_free; } orph = snod->node; @@ -663,20 +668,18 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); ubifs_dump_node(c, snod->node); - return -EINVAL; + err = -EINVAL; + goto out_free; } dbg_rcvry("out of date LEB %d", sleb->lnum); *outofdate = 1; - return 0; + err = 0; + goto out_free; } if (first) first = 0; - ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); - if (!ino) - return -ENOMEM; - n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; for (i = 0; i < n; i++) { union ubifs_key key1, key2; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index a551eb3e9b89..2b7c04bf8983 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -184,7 +184,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (err) goto out; } else { - sup->hash_algo = 0xffff; + sup->hash_algo = cpu_to_le16(0xffff); } sup->ch.node_type = UBIFS_SB_NODE; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7d4547e5202d..5e1e8ec0589e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2267,10 +2267,8 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, } } else { err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0); - if (err) { - kfree(c); + if (err) goto out_deact; - } /* We do not support atime */ sb->s_flags |= SB_ACTIVE; if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index a384a0f9ff32..234be1c4dc87 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -212,7 +212,7 @@ static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key, /** * layout_leb_in_gaps - layout index nodes using in-the-gaps method. * @c: UBIFS file-system description object - * @p: return LEB number here + * @p: return LEB number in @c->gap_lebs[p] * * This function lays out new index nodes for dirty znodes using in-the-gaps * method of TNC commit. 
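The do_kill_orphans() fix above is a classic shape: the scratch ino buffer used to be allocated inside the scan loop (and leaked on the early returns), so the patch hoists the allocation in front of the loop and routes every exit through a single out_free label. The same single-allocation, single-exit idiom in miniature, with a hypothetical parse_records() standing in for the orphan scan:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SZ 64

/* Hypothetical loop that needs one scratch buffer for all iterations. */
static int parse_records(const char *const *recs, int n)
{
        char *buf = malloc(BUF_SZ);     /* allocate once, up front */
        int err = 0;

        if (!buf)
                return -1;

        for (int i = 0; i < n; i++) {
                if (strlen(recs[i]) >= BUF_SZ) {
                        err = -1;       /* every exit path ... */
                        goto out_free;  /* ... goes through the label */
                }
                strcpy(buf, recs[i]);
                printf("record %d: %s\n", i, buf);
        }

out_free:
        free(buf);                      /* freed exactly once */
        return err;
}

int main(void)
{
        const char *recs[] = { "alpha", "beta" };

        return parse_records(recs, 2) ? 1 : 0;
}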
@@ -221,7 +221,7 @@ static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key, * This function returns the number of index nodes written into the gaps, or a * negative error code on failure. */ -static int layout_leb_in_gaps(struct ubifs_info *c, int *p) +static int layout_leb_in_gaps(struct ubifs_info *c, int p) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; @@ -236,7 +236,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p) * filled, however we do not check there at present. */ return lnum; /* Error code */ - *p = lnum; + c->gap_lebs[p] = lnum; dbg_gc("LEB %d", lnum); /* * Scan the index LEB. We use the generic scan for this even though @@ -355,7 +355,7 @@ static int get_leb_cnt(struct ubifs_info *c, int cnt) */ static int layout_in_gaps(struct ubifs_info *c, int cnt) { - int err, leb_needed_cnt, written, *p; + int err, leb_needed_cnt, written, p = 0, old_idx_lebs, *gap_lebs; dbg_gc("%d znodes to write", cnt); @@ -364,9 +364,9 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) if (!c->gap_lebs) return -ENOMEM; - p = c->gap_lebs; + old_idx_lebs = c->lst.idx_lebs; do { - ubifs_assert(c, p < c->gap_lebs + c->lst.idx_lebs); + ubifs_assert(c, p < c->lst.idx_lebs); written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; @@ -392,9 +392,29 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) leb_needed_cnt = get_leb_cnt(c, cnt); dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, leb_needed_cnt, c->ileb_cnt); + /* + * Dynamically change the size of @c->gap_lebs to prevent + * oob, because @c->lst.idx_lebs could be increased by + * function @get_idx_gc_leb (called by layout_leb_in_gaps-> + * ubifs_find_dirty_idx_leb) during loop. Only enlarge + * @c->gap_lebs when needed. + * + */ + if (leb_needed_cnt > c->ileb_cnt && p >= old_idx_lebs && + old_idx_lebs < c->lst.idx_lebs) { + old_idx_lebs = c->lst.idx_lebs; + gap_lebs = krealloc(c->gap_lebs, sizeof(int) * + (old_idx_lebs + 1), GFP_NOFS); + if (!gap_lebs) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return -ENOMEM; + } + c->gap_lebs = gap_lebs; + } } while (leb_needed_cnt > c->ileb_cnt); - *p = -1; + c->gap_lebs[p] = -1; return 0; } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f9fd18670e22..37df7c9eedb1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + new_flags = (vma->vm_flags & + ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1834,13 +1835,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; features = uffdio_api.features; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) { - memset(&uffdio_api, 0, sizeof(uffdio_api)); - if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) - goto out; - ret = -EINVAL; - goto out; - } + ret = -EINVAL; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + goto err_out; + ret = -EPERM; + if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) + goto err_out; /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; @@ -1853,6 +1853,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = 0; out: return 
ret; +err_out: + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + ret = -EFAULT; + goto out; } static long userfaultfd_ioctl(struct file *file, unsigned cmd, @@ -1923,7 +1928,7 @@ static const struct file_operations userfaultfd_fops = { .poll = userfaultfd_poll, .read = userfaultfd_read, .unlocked_ioctl = userfaultfd_ioctl, - .compat_ioctl = userfaultfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; diff --git a/fs/utimes.c b/fs/utimes.c index 1ba3f7883870..c952b6b3d8a0 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -161,9 +161,9 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, * utimensat() instead. */ static long do_futimesat(int dfd, const char __user *filename, - struct timeval __user *utimes) + struct __kernel_old_timeval __user *utimes) { - struct timeval times[2]; + struct __kernel_old_timeval times[2]; struct timespec64 tstimes[2]; if (utimes) { @@ -190,13 +190,13 @@ static long do_futimesat(int dfd, const char __user *filename, SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, - struct timeval __user *, utimes) + struct __kernel_old_timeval __user *, utimes) { return do_futimesat(dfd, filename, utimes); } SYSCALL_DEFINE2(utimes, char __user *, filename, - struct timeval __user *, utimes) + struct __kernel_old_timeval __user *, utimes) { return do_futimesat(AT_FDCWD, filename, utimes); } diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 37f680b9a570..ef95ca07d084 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -366,11 +366,11 @@ struct xfs_bulkstat { uint64_t bs_blocks; /* number of blocks */ uint64_t bs_xflags; /* extended flags */ - uint64_t bs_atime; /* access time, seconds */ - uint64_t bs_mtime; /* modify time, seconds */ + int64_t bs_atime; /* access time, seconds */ + int64_t bs_mtime; /* modify time, seconds */ - uint64_t bs_ctime; /* inode change time, seconds */ - uint64_t bs_btime; /* creation time, seconds */ + int64_t bs_ctime; /* inode change time, seconds */ + int64_t bs_btime; /* creation time, seconds */ uint32_t bs_gen; /* generation count */ uint32_t bs_uid; /* user id */
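Finally, the xfs_bulkstat change above flips the four timestamp fields from uint64_t to int64_t. Seconds-since-the-epoch need a signed type: a file stamped before 1970 carries negative seconds, and storing that in an unsigned field hands userspace an enormous bogus value instead. A two-printf demonstration of the failure mode:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
        int64_t atime = -86400;           /* 1969-12-31, a valid stamp */
        uint64_t bogus = (uint64_t)atime; /* what a u64 field stores */

        printf("signed:   %" PRId64 "\n", atime);
        printf("unsigned: %" PRIu64 "\n", bogus); /* 18446744073709465216 */
        return 0;
}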