Diffstat (limited to 'fs'): 174 files changed, 6457 insertions(+), 3685 deletions(-)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8d97f0b45e9c..795706520b5e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -399,7 +399,7 @@ static int v9fs_test_inode(struct inode *inode, void *data) umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) return 0; /* compare qid details */ @@ -1390,7 +1390,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * Don't update inode if the file type is different */ umode = p9mode2unixmode(v9ses, st, &rdev); - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) goto out; /* diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1dc7af046615..e1c0240b51c0 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -59,7 +59,7 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) return 0; if (inode->i_generation != st->st_gen) @@ -663,14 +663,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, if (stat->st_result_mask & P9_STATS_NLINK) set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { - inode->i_mode = stat->st_mode; - if ((S_ISBLK(inode->i_mode)) || - (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, - inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; } - if (stat->st_result_mask & P9_STATS_RDEV) - inode->i_rdev = new_decode_dev(stat->st_rdev); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) v9fs_i_size_write(inode, stat->st_size); @@ -959,7 +955,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) /* * Don't update inode if the file type is different */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) goto out; /* diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 12be88716e4c..5a70c09f5325 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -102,13 +102,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, switch (status->type) { case AFS_FTYPE_FILE: - inode->i_mode = S_IFREG | status->mode; + inode->i_mode = S_IFREG | (status->mode & S_IALLUGO); inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; inode->i_mapping->a_ops = &afs_fs_aops; break; case AFS_FTYPE_DIR: - inode->i_mode = S_IFDIR | status->mode; + inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO); inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; inode->i_mapping->a_ops = &afs_dir_aops; @@ -198,7 +198,7 @@ static void afs_apply_status(struct afs_operation *op, if (status->mode != vnode->status.mode) { mode = inode->i_mode; mode &= ~S_IALLUGO; - mode |= status->mode; + mode |= status->mode & S_IALLUGO; WRITE_ONCE(inode->i_mode, mode); } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b12ba98ae9f5..187b3f2b9202 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2267,8 +2267,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); for (i = 0; i < vma_count; i++) { struct core_vma_metadata *meta = vma_meta + i; 
@@ -2276,7 +2275,6 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; } - dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3cfd6cd46f26..2c99b102c860 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1631,8 +1631,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index b4fb997eda16..cec88a66bd6c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o + subpage.o tree-mod-log.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f47c1528eb9a..117d423fdb93 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -14,6 +14,7 @@ #include "delayed-ref.h" #include "locking.h" #include "misc.h" +#include "tree-mod-log.h" /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == SEQ_LAST) + else if (time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else ret = btrfs_search_old_slot(root, &search_key, path, time_seq); @@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. 
* When roots are found, they're added to the roots list * - * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave - * much like trans == NULL case, the difference only lies in it will not + * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and + * behave much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). * @@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; /* @@ -1217,9 +1218,9 @@ again: #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != SEQ_LAST) { + time_seq != BTRFS_SEQ_LAST) { #else - if (trans && time_seq != SEQ_LAST) { + if (trans && time_seq != BTRFS_SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the @@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, struct btrfs_trans_handle *trans; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { .root_objectid = root->root_key.objectid, @@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); + struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; @@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, } if (trans) - btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_get_tree_mod_seq(fs_info, &seq_elem); else down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - tree_mod_seq_elem.seq, &refs, + seq_elem.seq, &refs, &extent_item_pos, ignore_offset); if (ret) goto out; @@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots, + seq_elem.seq, &roots, ignore_offset); if (ret) break; @@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: if (trans) { - btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_put_tree_mod_seq(fs_info, &seq_elem); btrfs_end_transaction(trans); } else { up_read(&fs_info->commit_root_sem); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 744b99ddc28c..aa57bdc8fc89 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex. 
*/ - if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) return; spin_lock(&fs_info->unused_bgs_lock); @@ -1462,12 +1462,12 @@ next: spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); } @@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +void btrfs_reclaim_bgs_work(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info = + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); + struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + int ret; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&bg->bg_list); + + space_info = bg->space_info; + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&bg->lock); + + /* Get out fast, in case we're unmounting the filesystem */ + if (btrfs_fs_closing(fs_info)) { + up_write(&space_info->groups_sem); + goto next; + } + + ret = inc_block_group_ro(bg, 0); + up_write(&space_info->groups_sem); + if (ret < 0) + goto next; + + btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used", + bg->start, div_u64(bg->used * 100, bg->length)); + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start); + if (ret) + btrfs_err(fs_info, "error relocating chunk %llu", + bg->start); + +next: + btrfs_put_block_group(bg); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_exclop_finish(fs_info); +} + +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&fs_info->reclaim_bgs)) + queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + spin_unlock(&fs_info->unused_bgs_lock); +} + +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_reclaim_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_path *path) { @@ -2267,29 +2358,33 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, struct btrfs_trans_handle *trans; u64 alloc_flags; int ret; + bool dirty_bg_running; -again: - trans = 
btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + do { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - /* - * we're not allowed to set block groups readonly after the dirty - * block groups cache has started writing. If it already started, - * back off and let this transaction commit - */ - mutex_lock(&fs_info->ro_block_group_mutex); - if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { - u64 transid = trans->transid; + dirty_bg_running = false; - mutex_unlock(&fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans); + /* + * We're not allowed to set block groups readonly after the dirty + * block group cache has started writing. If it already started, + * back off and let this transaction commit. + */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; - ret = btrfs_wait_for_commit(fs_info, transid); - if (ret) - return ret; - goto again; - } + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); + + ret = btrfs_wait_for_commit(fs_info, transid); + if (ret) + return ret; + dirty_bg_running = true; + } + } while (dirty_bg_running); if (do_chunk_alloc) { /* @@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) */ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) { + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; @@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) lockdep_assert_held(&fs_info->chunk_mutex); info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +again: spin_lock(&info->lock); left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); @@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (left < thresh) { u64 flags = btrfs_system_alloc_profile(fs_info); + u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved); + + /* + * If there's not available space for the chunk tree (system + * space) and there are other tasks that reserved space for + * creating a new system block group, wait for them to complete + * the creation of their system block group and release excess + * reserved space. We do this because: + * + * *) We can end up allocating more system chunks than necessary + * when there are multiple tasks that are concurrently + * allocating block groups, which can lead to exhaustion of + * the system array in the superblock; + * + * *) If we allocate extra and unnecessary system block groups, + * despite being empty for a long time, and possibly forever, + * they end not being added to the list of unused block groups + * because that typically happens only when deallocating the + * last extent from a block group - which never happens since + * we never allocate from them in the first place. The few + * exceptions are when mounting a filesystem or running scrub, + * which add unused block groups to the list of unused block + * groups, to be deleted by the cleaner kthread. 
+ * And even when they are added to the list of unused block + * groups, it can take a long time until they get deleted, + * since the cleaner kthread might be sleeping or busy with + * other work (deleting subvolumes, running delayed iputs, + * defrag scheduling, etc); + * + * This is rare in practice, but can happen when too many tasks + * are allocating blocks groups in parallel (via fallocate()) + * and before the one that reserved space for a new system block + * group finishes the block group creation and releases the space + * reserved in excess (at btrfs_create_pending_block_groups()), + * other tasks end up here and see free system space temporarily + * not enough for updating the chunk tree. + * + * We unlock the chunk mutex before waiting for such tasks and + * lock it again after the wait, otherwise we would deadlock. + * It is safe to do so because allocating a system chunk is the + * first thing done while allocating a new block group. + */ + if (reserved > trans->chunk_bytes_reserved) { + const u64 min_needed = reserved - thresh; + + mutex_unlock(&fs_info->chunk_mutex); + wait_event(cur_trans->chunk_reserve_wait, + atomic64_read(&cur_trans->chunk_bytes_reserved) <= + min_needed); + mutex_lock(&fs_info->chunk_mutex); + goto again; + } /* * Ignore failure to create system chunk. We might end up not @@ -3315,8 +3464,10 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) ret = btrfs_block_rsv_add(fs_info->chunk_root, &fs_info->chunk_block_rsv, thresh, BTRFS_RESERVE_NO_FLUSH); - if (!ret) + if (!ret) { + atomic64_add(thresh, &cur_trans->chunk_bytes_reserved); trans->chunk_bytes_reserved += thresh; + } } } @@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->reclaim_bgs)) { + block_group = list_first_entry(&info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ecc3372a5ce..7b927425dc71 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); +void btrfs_reclaim_bgs_work(struct work_struct *work); +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 28e202e89660..c652e19ad74e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -220,6 +220,7 @@ struct btrfs_inode { /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; + struct rw_semaphore i_mmap_lock; struct inode vfs_inode; }; @@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, mod); } -static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) +/* + * Called every time after doing a buffered, 
direct IO or memory mapped write. + * + * This is to ensure that if we write to a file that was previously fsynced in + * the current transaction, then try to fsync it again in the same transaction, + * we will know that there were changes in the file and that it needs to be + * logged. + */ +static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) +{ + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); +} + +static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { - int ret = 0; + bool ret = false; spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) { - /* - * After a ranged fsync we might have left some extent maps - * (that fall outside the fsync's range). So return false - * here if the list isn't empty, to make sure btrfs_log_inode() - * will be called and process those extent maps. - */ - smp_mb(); - if (list_empty(&inode->extent_tree.modified_extents)) - ret = 1; - } + inode->last_sub_trans <= inode->root->last_log_commit) + ret = true; spin_unlock(&inode->lock); return ret; } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 113cb85c1fd4..169508609324 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) BUG_ON(!block_ctx->pagev); num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> PAGE_SHIFT; + /* Pages must be unmapped in reverse order */ while (num_pages > 0) { num_pages--; if (block_ctx->datav[num_pages]) { - kunmap(block_ctx->pagev[num_pages]); + kunmap_local(block_ctx->datav[num_pages]); block_ctx->datav[num_pages] = NULL; } if (block_ctx->pagev[num_pages]) { @@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, i = j; } for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]); return block_ctx->len; } @@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i = 0; + int i = 0; u64 dev_bytenr; u64 cur_bytenr; struct bio_vec bvec; @@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_for_each_segment(bvec, bio, iter) { BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec.bv_page); + mapped_datav[i] = kmap_local_page(bvec.bv_page); i++; if (dev_state->state->print_mask & @@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio) mapped_datav, segs, bio, &bio_is_patched, bio->bi_opf); - bio_for_each_segment(bvec, bio, iter) - kunmap(bvec.bv_page); + /* Unmap in reverse order */ + for (--i; i >= 0; i--) + kunmap_local(mapped_datav[i]); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 3f4c832abfed..17f93fd28f7e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -80,10 +80,15 @@ static int compression_compress_pages(int type, struct list_head *ws, case BTRFS_COMPRESS_NONE: default: /* - * This can't happen, the type is validated several times - * before we get here. 
As a sane fallback, return what the - * callers will understand as 'no compression happened'. + * This can happen when compression races with remount setting + * it to 'no compress', while caller doesn't call + * inode_need_compress() to check if we really need to + * compress. + * + * Not a big deal, just need to inform caller that we + * haven't allocated any pages yet. */ + *out_pages = 0; return -E2BIG; } } @@ -1611,7 +1616,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, curr_sample_pos = 0; while (index < index_end) { page = find_get_page(inode->i_mapping, index); - in_data = kmap(page); + in_data = kmap_local_page(page); /* Handle case where the start is not aligned to PAGE_SIZE */ i = start % PAGE_SIZE; while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { @@ -1624,7 +1629,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, start += SAMPLING_INTERVAL; curr_sample_pos += SAMPLING_READ_SIZE; } - kunmap(page); + kunmap_local(in_data); put_page(page); index++; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 34b929bd5c1a..a484fb72a01f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -14,6 +14,7 @@ #include "locking.h" #include "volumes.h" #include "qgroup.h" +#include "tree-mod-log.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -233,597 +234,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } -enum mod_log_op { - MOD_LOG_KEY_REPLACE, - MOD_LOG_KEY_ADD, - MOD_LOG_KEY_REMOVE, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, - MOD_LOG_MOVE_KEYS, - MOD_LOG_ROOT_REPLACE, -}; - -struct tree_mod_root { - u64 logical; - u8 level; -}; - -struct tree_mod_elem { - struct rb_node node; - u64 logical; - u64 seq; - enum mod_log_op op; - - /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ - int slot; - - /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ - u64 generation; - - /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ - struct btrfs_disk_key key; - u64 blockptr; - - /* this is used for op == MOD_LOG_MOVE_KEYS */ - struct { - int dst_slot; - int nr_items; - } move; - - /* this is used for op == MOD_LOG_ROOT_REPLACE */ - struct tree_mod_root old_root; -}; - -/* - * Pull a new tree mod seq number for our operation. - */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) -{ - return atomic64_inc_return(&fs_info->tree_mod_seq); -} - -/* - * This adds a new blocker to the tree mod log's blocker list if the @elem - * passed does not already have a sequence number set. So when a caller expects - * to record tree modifications, it should ensure to set elem->seq to zero - * before calling btrfs_get_tree_mod_seq. - * Returns a fresh, unused tree log modification sequence number, even if no new - * blocker was added. 
- */ -u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - write_lock(&fs_info->tree_mod_log_lock); - if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq(fs_info); - list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); - } - write_unlock(&fs_info->tree_mod_log_lock); - - return elem->seq; -} - -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct rb_node *next; - struct tree_mod_elem *tm; - u64 min_seq = (u64)-1; - u64 seq_putting = elem->seq; - - if (!seq_putting) - return; - - write_lock(&fs_info->tree_mod_log_lock); - list_del(&elem->list); - elem->seq = 0; - - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *first; - - first = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - if (seq_putting > first->seq) { - /* - * Blocker with lower sequence number exists, we - * cannot remove anything from the log. - */ - write_unlock(&fs_info->tree_mod_log_lock); - return; - } - min_seq = first->seq; - } - - /* - * anything that's lower than the lowest existing (read: blocked) - * sequence number can be removed from the tree. - */ - tm_root = &fs_info->tree_mod_log; - for (node = rb_first(tm_root); node; node = next) { - next = rb_next(node); - tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq >= min_seq) - continue; - rb_erase(node, tm_root); - kfree(tm); - } - write_unlock(&fs_info->tree_mod_log_lock); -} - -/* - * key order of the log: - * node/leaf start address -> sequence - * - * The 'start address' is the logical address of the *new* root node - * for root replace operations, or the logical address of the affected - * block for all other operations. - */ -static noinline int -__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) -{ - struct rb_root *tm_root; - struct rb_node **new; - struct rb_node *parent = NULL; - struct tree_mod_elem *cur; - - lockdep_assert_held_write(&fs_info->tree_mod_log_lock); - - tm->seq = btrfs_inc_tree_mod_seq(fs_info); - - tm_root = &fs_info->tree_mod_log; - new = &tm_root->rb_node; - while (*new) { - cur = rb_entry(*new, struct tree_mod_elem, node); - parent = *new; - if (cur->logical < tm->logical) - new = &((*new)->rb_left); - else if (cur->logical > tm->logical) - new = &((*new)->rb_right); - else if (cur->seq < tm->seq) - new = &((*new)->rb_left); - else if (cur->seq > tm->seq) - new = &((*new)->rb_right); - else - return -EEXIST; - } - - rb_link_node(&tm->node, parent, new); - rb_insert_color(&tm->node, tm_root); - return 0; -} - -/* - * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it - * returns zero with the tree_mod_log_lock acquired. The caller must hold - * this until all tree mod log insertions are recorded in the rb tree and then - * write unlock fs_info::tree_mod_log_lock. - */ -static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) { - smp_mb(); - if (list_empty(&(fs_info)->tree_mod_seq_list)) - return 1; - if (eb && btrfs_header_level(eb) == 0) - return 1; - - write_lock(&fs_info->tree_mod_log_lock); - if (list_empty(&(fs_info)->tree_mod_seq_list)) { - write_unlock(&fs_info->tree_mod_log_lock); - return 1; - } - - return 0; -} - -/* Similar to tree_mod_dont_log, but doesn't acquire any locks. 
*/ -static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) -{ - smp_mb(); - if (list_empty(&(fs_info)->tree_mod_seq_list)) - return 0; - if (eb && btrfs_header_level(eb) == 0) - return 0; - - return 1; -} - -static struct tree_mod_elem * -alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - struct tree_mod_elem *tm; - - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return NULL; - - tm->logical = eb->start; - if (op != MOD_LOG_KEY_ADD) { - btrfs_node_key(eb, &tm->key, slot); - tm->blockptr = btrfs_node_blockptr(eb, slot); - } - tm->op = op; - tm->slot = slot; - tm->generation = btrfs_node_ptr_generation(eb, slot); - RB_CLEAR_NODE(&tm->node); - - return tm; -} - -static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - struct tree_mod_elem *tm; - int ret; - - if (!tree_mod_need_log(eb->fs_info, eb)) - return 0; - - tm = alloc_tree_mod_elem(eb, slot, op, flags); - if (!tm) - return -ENOMEM; - - if (tree_mod_dont_log(eb->fs_info, eb)) { - kfree(tm); - return 0; - } - - ret = __tree_mod_log_insert(eb->fs_info, tm); - write_unlock(&eb->fs_info->tree_mod_log_lock); - if (ret) - kfree(tm); - - return ret; -} - -static noinline int tree_mod_log_insert_move(struct extent_buffer *eb, - int dst_slot, int src_slot, int nr_items) -{ - struct tree_mod_elem *tm = NULL; - struct tree_mod_elem **tm_list = NULL; - int ret = 0; - int i; - int locked = 0; - - if (!tree_mod_need_log(eb->fs_info, eb)) - return 0; - - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; - goto free_tms; - } - - tm->logical = eb->start; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; - - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(eb->fs_info, eb)) - goto free_tms; - locked = 1; - - /* - * When we override something during the move, we log these removals. - * This can only happen when we move towards the beginning of the - * buffer, i.e. dst_slot < src_slot. 
- */ - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]); - if (ret) - goto free_tms; - } - - ret = __tree_mod_log_insert(eb->fs_info, tm); - if (ret) - goto free_tms; - write_unlock(&eb->fs_info->tree_mod_log_lock); - kfree(tm_list); - - return 0; -free_tms: - for (i = 0; i < nr_items; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); - kfree(tm_list[i]); - } - if (locked) - write_unlock(&eb->fs_info->tree_mod_log_lock); - kfree(tm_list); - kfree(tm); - - return ret; -} - -static inline int -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) -{ - int i, j; - int ret; - - for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert(fs_info, tm_list[i]); - if (ret) { - for (j = nritems - 1; j > i; j--) - rb_erase(&tm_list[j]->node, - &fs_info->tree_mod_log); - return ret; - } - } - - return 0; -} - -static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root, - struct extent_buffer *new_root, int log_removal) -{ - struct btrfs_fs_info *fs_info = old_root->fs_info; - struct tree_mod_elem *tm = NULL; - struct tree_mod_elem **tm_list = NULL; - int nritems = 0; - int ret = 0; - int i; - - if (!tree_mod_need_log(fs_info, NULL)) - return 0; - - if (log_removal && btrfs_header_level(old_root) > 0) { - nritems = btrfs_header_nritems(old_root); - tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - GFP_NOFS); - if (!tm_list) { - ret = -ENOMEM; - goto free_tms; - } - for (i = 0; i < nritems; i++) { - tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - } - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; - goto free_tms; - } - - tm->logical = new_root->start; - tm->old_root.logical = old_root->start; - tm->old_root.level = btrfs_header_level(old_root); - tm->generation = btrfs_header_generation(old_root); - tm->op = MOD_LOG_ROOT_REPLACE; - - if (tree_mod_dont_log(fs_info, NULL)) - goto free_tms; - - if (tm_list) - ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); - if (!ret) - ret = __tree_mod_log_insert(fs_info, tm); - - write_unlock(&fs_info->tree_mod_log_lock); - if (ret) - goto free_tms; - kfree(tm_list); - - return ret; - -free_tms: - if (tm_list) { - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); - } - kfree(tm); - - return ret; -} - -static struct tree_mod_elem * -__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, - int smallest) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct tree_mod_elem *cur = NULL; - struct tree_mod_elem *found = NULL; - - read_lock(&fs_info->tree_mod_log_lock); - tm_root = &fs_info->tree_mod_log; - node = tm_root->rb_node; - while (node) { - cur = rb_entry(node, struct tree_mod_elem, node); - if (cur->logical < start) { - node = node->rb_left; - } else if (cur->logical > start) { - node = node->rb_right; - } else if (cur->seq < min_seq) { - node = node->rb_left; - } else if (!smallest) { - /* we want the node with the highest seq */ - if (found) - BUG_ON(found->seq > cur->seq); - found = cur; - node = node->rb_left; - } else if (cur->seq > min_seq) { - /* we want the node with the smallest seq */ - if (found) - BUG_ON(found->seq < cur->seq); - found = cur; - node = node->rb_right; - } else { - found = cur; - break; - } - } - read_unlock(&fs_info->tree_mod_log_lock); 
- - return found; -} - -/* - * this returns the element from the log with the smallest time sequence - * value that's in the log (the oldest log item). any element with a time - * sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, - u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 1); -} - -/* - * this returns the element from the log with the largest time sequence - * value that's in the log (the most recent log item). any element with - * a time sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 0); -} - -static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) -{ - struct btrfs_fs_info *fs_info = dst->fs_info; - int ret = 0; - struct tree_mod_elem **tm_list = NULL; - struct tree_mod_elem **tm_list_add, **tm_list_rem; - int i; - int locked = 0; - - if (!tree_mod_need_log(fs_info, NULL)) - return 0; - - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return 0; - - tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), - GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm_list_add = tm_list; - tm_list_rem = tm_list + nr_items; - for (i = 0; i < nr_items; i++) { - tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - if (!tm_list_rem[i]) { - ret = -ENOMEM; - goto free_tms; - } - - tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - MOD_LOG_KEY_ADD, GFP_NOFS); - if (!tm_list_add[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(fs_info, NULL)) - goto free_tms; - locked = 1; - - for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); - if (ret) - goto free_tms; - ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); - if (ret) - goto free_tms; - } - - write_unlock(&fs_info->tree_mod_log_lock); - kfree(tm_list); - - return 0; - -free_tms: - for (i = 0; i < nr_items * 2; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); - kfree(tm_list[i]); - } - if (locked) - write_unlock(&fs_info->tree_mod_log_lock); - kfree(tm_list); - - return ret; -} - -static noinline int tree_mod_log_free_eb(struct extent_buffer *eb) -{ - struct tree_mod_elem **tm_list = NULL; - int nritems = 0; - int i; - int ret = 0; - - if (btrfs_header_level(eb) == 0) - return 0; - - if (!tree_mod_need_log(eb->fs_info, NULL)) - return 0; - - nritems = btrfs_header_nritems(eb); - tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - for (i = 0; i < nritems; i++) { - tm_list[i] = alloc_tree_mod_elem(eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(eb->fs_info, eb)) - goto free_tms; - - ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); - write_unlock(&eb->fs_info->tree_mod_log_lock); - if (ret) - goto free_tms; - kfree(tm_list); - - return 0; - -free_tms: - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); - - return ret; -} - /* * check if the tree block can be shared by multiple trees */ @@ -1090,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle 
*trans, parent_start = buf->start; atomic_inc(&cow->refs); - ret = tree_mod_log_insert_root(root->node, cow, 1); + ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); @@ -1100,15 +510,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - tree_mod_log_insert_key(parent, parent_slot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); if (last_ref) { - ret = tree_mod_log_free_eb(buf); + ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); @@ -1127,298 +537,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } -/* - * returns the logical address of the oldest predecessor of the given root. - * entries older than time_seq are ignored. - */ -static struct tree_mod_elem *__tree_mod_log_oldest_root( - struct extent_buffer *eb_root, u64 time_seq) -{ - struct tree_mod_elem *tm; - struct tree_mod_elem *found = NULL; - u64 root_logical = eb_root->start; - int looped = 0; - - if (!time_seq) - return NULL; - - /* - * the very last operation that's logged for a root is the - * replacement operation (if it is replaced at all). this has - * the logical address of the *new* root, making it the very - * first operation that's logged for this root. - */ - while (1) { - tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, - time_seq); - if (!looped && !tm) - return NULL; - /* - * if there are no tree operation for the oldest root, we simply - * return it. this should only happen if that (old) root is at - * level 0. - */ - if (!tm) - break; - - /* - * if there's an operation that's not a root replacement, we - * found the oldest version of our root. normally, we'll find a - * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. - */ - if (tm->op != MOD_LOG_ROOT_REPLACE) - break; - - found = tm; - root_logical = tm->old_root.logical; - looped = 1; - } - - /* if there's no old root to return, return what we found instead */ - if (!found) - found = tm; - - return found; -} - -/* - * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewound (until we reach something older than - * time_seq). - */ -static void -__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq, struct tree_mod_elem *first_tm) -{ - u32 n; - struct rb_node *next; - struct tree_mod_elem *tm = first_tm; - unsigned long o_dst; - unsigned long o_src; - unsigned long p_size = sizeof(struct btrfs_key_ptr); - - n = btrfs_header_nritems(eb); - read_lock(&fs_info->tree_mod_log_lock); - while (tm && tm->seq >= time_seq) { - /* - * all the operations are recorded with the operator used for - * the modification. as we're going backwards, we do the - * opposite of each operation here. 
- */ - switch (tm->op) { - case MOD_LOG_KEY_REMOVE_WHILE_FREEING: - BUG_ON(tm->slot < n); - fallthrough; - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: - case MOD_LOG_KEY_REMOVE: - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - n++; - break; - case MOD_LOG_KEY_REPLACE: - BUG_ON(tm->slot >= n); - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - break; - case MOD_LOG_KEY_ADD: - /* if a move operation is needed it's in the log */ - n--; - break; - case MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); - memmove_extent_buffer(eb, o_dst, o_src, - tm->move.nr_items * p_size); - break; - case MOD_LOG_ROOT_REPLACE: - /* - * this operation is special. for roots, this must be - * handled explicitly before rewinding. - * for non-roots, this operation may exist if the node - * was a root: root A -> child B; then A gets empty and - * B is promoted to the new root. in the mod log, we'll - * have a root-replace operation for B, a tree block - * that is no root. we simply ignore that operation. - */ - break; - } - next = rb_next(&tm->node); - if (!next) - break; - tm = rb_entry(next, struct tree_mod_elem, node); - if (tm->logical != first_tm->logical) - break; - } - read_unlock(&fs_info->tree_mod_log_lock); - btrfs_set_header_nritems(eb, n); -} - -/* - * Called with eb read locked. If the buffer cannot be rewound, the same buffer - * is returned. If rewind operations happen, a fresh buffer is returned. The - * returned buffer is always read-locked. If the returned buffer is not the - * input buffer, the lock on the input buffer is released and the input buffer - * is freed (its refcount is decremented). - */ -static struct extent_buffer * -tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct extent_buffer *eb, u64 time_seq) -{ - struct extent_buffer *eb_rewin; - struct tree_mod_elem *tm; - - if (!time_seq) - return eb; - - if (btrfs_header_level(eb) == 0) - return eb; - - tm = tree_mod_log_search(fs_info, eb->start, time_seq); - if (!tm) - return eb; - - if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); - if (!eb_rewin) { - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - return NULL; - } - btrfs_set_header_bytenr(eb_rewin, eb->start); - btrfs_set_header_backref_rev(eb_rewin, - btrfs_header_backref_rev(eb)); - btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); - btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); - } else { - eb_rewin = btrfs_clone_extent_buffer(eb); - if (!eb_rewin) { - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - return NULL; - } - } - - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), - eb_rewin, btrfs_header_level(eb_rewin)); - btrfs_tree_read_lock(eb_rewin); - __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); - WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - - return eb_rewin; -} - -/* - * get_old_root() rewinds the state of @root's root node to the given @time_seq - * value. If there are no changes, the current root->root_node is returned. 
If - * anything changed in between, there's a fresh buffer allocated on which the - * rewind operations are done. In any case, the returned buffer is read locked. - * Returns NULL on error (with no locks held). - */ -static inline struct extent_buffer * -get_old_root(struct btrfs_root *root, u64 time_seq) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct tree_mod_elem *tm; - struct extent_buffer *eb = NULL; - struct extent_buffer *eb_root; - u64 eb_root_owner = 0; - struct extent_buffer *old; - struct tree_mod_root *old_root = NULL; - u64 old_generation = 0; - u64 logical; - int level; - - eb_root = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(eb_root, time_seq); - if (!tm) - return eb_root; - - if (tm->op == MOD_LOG_ROOT_REPLACE) { - old_root = &tm->old_root; - old_generation = tm->generation; - logical = old_root->logical; - level = old_root->level; - } else { - logical = eb_root->start; - level = btrfs_header_level(eb_root); - } - - tm = tree_mod_log_search(fs_info, logical, time_seq); - if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); - if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { - if (!IS_ERR(old)) - free_extent_buffer(old); - btrfs_warn(fs_info, - "failed to read tree block %llu from get_old_root", - logical); - } else { - btrfs_tree_read_lock(old); - eb = btrfs_clone_extent_buffer(old); - btrfs_tree_read_unlock(old); - free_extent_buffer(old); - } - } else if (old_root) { - eb_root_owner = btrfs_header_owner(eb_root); - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(fs_info, logical); - } else { - eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - } - - if (!eb) - return NULL; - if (old_root) { - btrfs_set_header_bytenr(eb, eb->start); - btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, eb_root_owner); - btrfs_set_header_level(eb, old_root->level); - btrfs_set_header_generation(eb, old_generation); - } - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, - btrfs_header_level(eb)); - btrfs_tree_read_lock(eb); - if (tm) - __tree_mod_log_rewind(fs_info, eb, time_seq, tm); - else - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - - return eb; -} - -int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) -{ - struct tree_mod_elem *tm; - int level; - struct extent_buffer *eb_root = btrfs_root_node(root); - - tm = __tree_mod_log_oldest_root(eb_root, time_seq); - if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { - level = tm->old_root.level; - } else { - level = btrfs_header_level(eb_root); - } - free_extent_buffer(eb_root); - - return level; -} - static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -1840,7 +958,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - ret = tree_mod_log_insert_root(root->node, child, 1); + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); BUG_ON(ret < 0); rcu_assign_pointer(root->node, child); @@ -1920,8 +1038,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); - ret = tree_mod_log_insert_key(parent, pslot + 1, 
- MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1966,8 +1084,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - ret = tree_mod_log_insert_key(parent, pslot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -2068,8 +1186,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - ret = tree_mod_log_insert_key(parent, pslot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -2122,8 +1240,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); - ret = tree_mod_log_insert_key(parent, pslot + 1, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2161,12 +1279,13 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, u64 search; u64 target; u64 nread = 0; + u64 nread_max; struct extent_buffer *eb; u32 nr; u32 blocksize; u32 nscan = 0; - if (level != 1) + if (level != 1 && path->reada != READA_FORWARD_ALWAYS) return; if (!path->nodes[level]) @@ -2174,6 +1293,20 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, node = path->nodes[level]; + /* + * Since the time between visiting leaves is much shorter than the time + * between visiting nodes, limit read ahead of nodes to 1, to avoid too + * much IO at once (possibly random). 
+ */ + if (path->reada == READA_FORWARD_ALWAYS) { + if (level > 1) + nread_max = node->fs_info->nodesize; + else + nread_max = SZ_128K; + } else { + nread_max = SZ_64K; + } + search = btrfs_node_blockptr(node, slot); blocksize = fs_info->nodesize; eb = find_extent_buffer(fs_info, search); @@ -2192,7 +1325,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, if (nr == 0) break; nr--; - } else if (path->reada == READA_FORWARD) { + } else if (path->reada == READA_FORWARD || + path->reada == READA_FORWARD_ALWAYS) { nr++; if (nr >= nritems) break; @@ -2203,13 +1337,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, break; } search = btrfs_node_blockptr(node, nr); - if ((search <= target && target - search <= 65536) || + if (path->reada == READA_FORWARD_ALWAYS || + (search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { btrfs_readahead_node_child(node, nr); nread += blocksize; } nscan++; - if ((nread > 65536 || nscan > 32)) + if (nread > nread_max || nscan > 32) break; } } @@ -2318,6 +1453,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { + if (p->reada == READA_FORWARD_ALWAYS) + reada_for_search(fs_info, p, level, slot, key->objectid); + /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { /* @@ -2861,7 +1999,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, } again: - b = get_old_root(root, time_seq); + b = btrfs_get_old_root(root, time_seq); if (!b) { ret = -EIO; goto done; @@ -2916,7 +2054,7 @@ again: level = btrfs_header_level(b); btrfs_tree_read_lock(b); - b = tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -3030,8 +2168,8 @@ static void fixup_low_keys(struct btrfs_path *path, if (!path->nodes[i]) break; t = path->nodes[i]; - ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE, - GFP_ATOMIC); + ret = btrfs_tree_mod_log_insert_key(t, tslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -3194,7 +2332,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); + ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3206,8 +2344,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * Don't call tree_mod_log_insert_move here, key removal was - * already fully logged by tree_mod_log_eb_copy above. + * Don't call btrfs_tree_mod_log_insert_move() here, key removal + * was already fully logged by btrfs_tree_mod_log_eb_copy() above. 
*/ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), @@ -3268,15 +2406,15 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); + ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); - ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, - push_items); + ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, + push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3342,7 +2480,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - ret = tree_mod_log_insert_root(root->node, c, 0); + ret = btrfs_tree_mod_log_insert_root(root->node, c, false); BUG_ON(ret < 0); rcu_assign_pointer(root->node, c); @@ -3381,8 +2519,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); if (slot != nritems) { if (level) { - ret = tree_mod_log_insert_move(lower, slot + 1, slot, - nritems - slot); + ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, + slot, nritems - slot); BUG_ON(ret < 0); } memmove_extent_buffer(lower, @@ -3391,8 +2529,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { - ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD, - GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(lower, slot, + BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -3433,9 +2571,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * tree mod log: We don't log_removal old root in * insert_new_root, because that root buffer will be kept as a * normal node. We are going to log removal of half of the - * elements below with tree_mod_log_eb_copy. We're holding a - * tree lock on the buffer, which is why we cannot race with - * other tree_mod_log users. + * elements below with btrfs_tree_mod_log_eb_copy(). We're + * holding a tree lock on the buffer, which is why we cannot + * race with other tree_mod_log users. 
*/ ret = insert_new_root(trans, root, path, level + 1); if (ret) @@ -3462,7 +2600,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, root_add_used(root, fs_info->nodesize); ASSERT(btrfs_header_level(c) == level); - ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); + ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -4844,8 +3982,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (level) { - ret = tree_mod_log_insert_move(parent, slot, slot + 1, - nritems - slot - 1); + ret = btrfs_tree_mod_log_insert_move(parent, slot, + slot + 1, nritems - slot - 1); BUG_ON(ret < 0); } memmove_extent_buffer(parent, @@ -4854,8 +3992,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { - ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE, - GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, slot, + BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 81178dd329c5..f83fd3cbf243 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -342,6 +342,27 @@ struct btrfs_node { struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); +/* Read ahead values for struct btrfs_path.reada */ +enum { + READA_NONE, + READA_BACK, + READA_FORWARD, + /* + * Similar to READA_FORWARD but unlike it: + * + * 1) It will trigger readahead even for leaves that are not close to + * each other on disk; + * 2) It also triggers readahead for nodes; + * 3) During a search, even when a node or leaf is already in memory, it + * will still trigger readahead for other nodes and leaves that follow + * it. + * + * This is meant to be used only when we know we are iterating over the + * entire tree or a very large part of it. + */ + READA_FORWARD_ALWAYS, +}; + /* * btrfs_paths remember the path taken from the root down to the leaf. * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point @@ -350,7 +371,6 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. 
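To make the intended use of READA_FORWARD_ALWAYS concrete, here is a minimal, hypothetical full-tree walk (the function name is illustrative; the search and next-leaf calls are the standard btrfs iteration pattern):

    static int walk_whole_tree(struct btrfs_root *root)
    {
            struct btrfs_key key = { 0 };   /* start from the smallest possible key */
            struct btrfs_path *path;
            int ret;

            path = btrfs_alloc_path();
            if (!path)
                    return -ENOMEM;
            /* We will visit (nearly) every leaf, so readahead always pays off. */
            path->reada = READA_FORWARD_ALWAYS;

            ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
            if (ret < 0)
                    goto out;
            while (1) {
                    if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                            ret = btrfs_next_leaf(root, path);
                            if (ret < 0)
                                    goto out;
                            if (ret > 0) {  /* no more leaves */
                                    ret = 0;
                                    break;
                            }
                            continue;
                    }
                    /* process the item at (path->nodes[0], path->slots[0]) here */
                    path->slots[0]++;
            }
    out:
            btrfs_free_path(path);
            return ret;
    }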
*/ -enum { READA_NONE, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; @@ -482,16 +502,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -/* delayed seq elem */ -struct seq_list { - struct list_head list; - u64 seq; -}; - -#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } - -#define SEQ_LAST ((u64)-1) - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -572,6 +582,15 @@ enum { /* Indicate that we can't trust the free space tree for caching yet */ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif }; /* @@ -941,10 +960,16 @@ struct btrfs_fs_info { struct work_struct async_data_reclaim_work; struct work_struct preempt_reclaim_work; + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; - struct mutex delete_unused_bgs_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; /* Cached block sizes */ u32 nodesize; @@ -2691,7 +2716,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); /* @@ -2929,13 +2953,6 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } -/* tree mod log functions from ctree.c */ -u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); - /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, @@ -3084,7 +3101,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end, int mirror); + struct page *page, u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3179,6 +3196,7 @@ extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) #define BTRFS_ILOCK_TRY (1U << 1) +#define BTRFS_ILOCK_MMAP (1U << 2) int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); @@ -3220,8 +3238,9 @@ extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct inode *inode, 
struct btrfs_path *path, - const u64 start, const u64 end, +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, @@ -3408,6 +3427,19 @@ static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. + * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + /* * Get the correct offset inside the page of extent buffer. * @@ -3735,8 +3767,6 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf25401c9768..1a88f6214ebc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -602,7 +602,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_delayed_node *node) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -633,32 +632,17 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); - /* - * Since we're under a transaction reserve_metadata_bytes could - * try to commit the transaction which will make it return - * EAGAIN to make us stop the transaction we have, so return - * ENOSPC instead so that btrfs_dirty_inode knows what to do. - */ - if (ret == -EAGAIN) { - ret = -ENOSPC; - btrfs_qgroup_free_meta_prealloc(root, num_bytes); - } - if (!ret) { - node->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(fs_info, - "delayed_inode", - btrfs_ino(inode), - num_bytes, 1); - } else { + /* NO_FLUSH could only fail with -ENOSPC */ + ASSERT(ret == 0 || ret == -ENOSPC); + if (ret) btrfs_qgroup_free_meta_prealloc(root, num_bytes); - } - return ret; + } else { + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); } - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", - btrfs_ino(inode), num_bytes, 1); + node->inode_id, num_bytes, 1); node->bytes_reserved = num_bytes; } @@ -1589,8 +1573,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. 
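The delayed-ref.c hunks a little further below replace open-coded walks of fs_info->tree_mod_seq_list with a new btrfs_tree_mod_log_lowest_seq() helper. Reconstructing it from the deleted code, and assuming struct seq_list was renamed to btrfs_seq_list as part of the move to tree-mod-log.c, a plausible implementation is:

    /* A sketch only; the callers below treat a return of 0 as "no users". */
    u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
    {
            u64 min_seq = 0;

            read_lock(&fs_info->tree_mod_log_lock);
            if (!list_empty(&fs_info->tree_mod_seq_list)) {
                    struct btrfs_seq_list *elem;    /* assumed rename of seq_list */

                    elem = list_first_entry(&fs_info->tree_mod_seq_list,
                                            struct btrfs_seq_list, list);
                    min_seq = elem->seq;
            }
            read_unlock(&fs_info->tree_mod_log_lock);

            return min_seq;
    }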
*/ - inode_unlock_shared(inode); - inode_lock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1833,8 +1817,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, - delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 63be7d01a9a3..c92d9d4f5f46 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -11,6 +11,7 @@ #include "transaction.h" #include "qgroup.h" #include "space-info.h" +#include "tree-mod-log.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -494,16 +495,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - read_lock(&fs_info->tree_mod_log_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - read_unlock(&fs_info->tree_mod_log_lock); - + seq = btrfs_tree_mod_log_lowest_seq(fs_info); again: for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { @@ -517,23 +509,16 @@ again: int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { - struct seq_list *elem; int ret = 0; + u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); - read_lock(&fs_info->tree_mod_log_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - if (seq >= elem->seq) { - btrfs_debug(fs_info, - "holding back delayed_ref %#x.%x, lowest is %#x.%x", - (u32)(seq >> 32), (u32)seq, - (u32)(elem->seq >> 32), (u32)elem->seq); - ret = 1; - } + if (min_seq != 0 && seq >= min_seq) { + btrfs_debug(fs_info, + "holding back delayed_ref %llu, lowest is %llu", + seq, min_seq); + ret = 1; } - read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 289f1f09481d..c9a3036c23bf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "discard.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb, return ret; } +static int csum_one_extent_buffer(struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u8 result[BTRFS_CSUM_SIZE]; + int ret; + + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, + offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE) == 0); + csum_tree_block(eb, result); + + if (btrfs_header_level(eb)) + ret = btrfs_check_node(eb); + else + ret = btrfs_check_leaf_full(eb); + + if (ret < 0) { + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, + "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; + } + write_extent_buffer(eb, result, 0, fs_info->csum_size); + + return 0; +} + +/* Checksum all dirty extent buffers in one bio_vec */ +static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, + struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + u64 bvec_start = page_offset(page) + bvec->bv_offset; + u64 
cur; + int ret = 0; + + for (cur = bvec_start; cur < bvec_start + bvec->bv_len; + cur += fs_info->nodesize) { + struct extent_buffer *eb; + bool uptodate; + + eb = find_extent_buffer(fs_info, cur); + uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, + fs_info->nodesize); + + /* A dirty eb shouldn't disappear from buffer_radix */ + if (WARN_ON(!eb)) + return -EUCLEAN; + + if (WARN_ON(cur != btrfs_header_bytenr(eb))) { + free_extent_buffer(eb); + return -EUCLEAN; + } + if (WARN_ON(!uptodate)) { + free_extent_buffer(eb); + return -EUCLEAN; + } + + ret = csum_one_extent_buffer(eb); + free_extent_buffer(eb); + if (ret < 0) + return ret; + } + return ret; +} + /* * Checksum a dirty tree block before IO. This has extra checks to make sure * we only fill in the checksum field in the first page of a multi-page block. @@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec struct page *page = bvec->bv_page; u64 start = page_offset(page); u64 found_start; - u8 result[BTRFS_CSUM_SIZE]; struct extent_buffer *eb; - int ret; + + if (fs_info->sectorsize < PAGE_SIZE) + return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) @@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; - ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, - offsetof(struct btrfs_header, fsid), - BTRFS_FSID_SIZE) == 0); - - csum_tree_block(eb, result); - - if (btrfs_header_level(eb)) - ret = btrfs_check_node(eb); - else - ret = btrfs_check_leaf_full(eb); - - if (ret < 0) { - btrfs_print_tree(eb, 0); - btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; - } - write_extent_buffer(eb, result, 0, fs_info->csum_size); - - return 0; + return csum_one_extent_buffer(eb); } static int check_tree_block_fsid(struct extent_buffer *eb) @@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, static int btree_set_page_dirty(struct page *page) { #ifdef DEBUG + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_subpage *subpage; struct extent_buffer *eb; + int cur_bit = 0; + u64 page_start = page_offset(page); + + if (fs_info->sectorsize == PAGE_SIZE) { + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + return __set_page_dirty_nobuffers(page); + } + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + ASSERT(subpage->dirty_bitmap); + while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + unsigned long flags; + u64 cur; + u16 tmp = (1 << cur_bit); + + spin_lock_irqsave(&subpage->lock, flags); + if (!(tmp & subpage->dirty_bitmap)) { + spin_unlock_irqrestore(&subpage->lock, flags); + cur_bit++; + continue; + } + spin_unlock_irqrestore(&subpage->lock, flags); + cur = page_start + cur_bit * fs_info->sectorsize; - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + eb = find_extent_buffer(fs_info, cur); + ASSERT(eb); + ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + ASSERT(atomic_read(&eb->refs)); 
+ btrfs_assert_tree_locked(eb); + free_extent_buffer(eb); + + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + } #endif return __set_page_dirty_nobuffers(page); } @@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg) btrfs_run_defrag_inodes(fs_info); /* - * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * Acquires fs_info->reclaim_bgs_lock to avoid racing * with relocation (btrfs_relocate_chunk) and relocation * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) - * after acquiring fs_info->delete_unused_bgs_mutex. So we + * after acquiring fs_info->reclaim_bgs_lock. So we * can't hold, nor need to, fs_info->cleaner_mutex when deleting * unused block groups. */ btrfs_delete_unused_bgs(fs_info); + + /* + * Reclaim block groups in the reclaim_bgs list after we deleted + * all unused block_groups. This possibly gives us some more free + * space. + */ + btrfs_reclaim_bgs(fs_info); sleep: clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) @@ -2793,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); - mutex_init(&fs_info->delete_unused_bgs_mutex); + mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); @@ -2803,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); + INIT_LIST_HEAD(&fs_info->reclaim_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); @@ -2891,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->swapfile_pins = RB_ROOT; fs_info->send_in_progress = 0; + + fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; + INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) @@ -4249,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->reclaim_bgs_work); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 36a3c973fda1..7a28314189b4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2490,19 +2490,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } -int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group *block_group; - int readonly = 0; - - block_group = btrfs_lookup_block_group(fs_info, bytenr); - if (!block_group || block_group->ro) - readonly = 1; - if (block_group) - btrfs_put_block_group(block_group); - return readonly; -} - static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3355,11 +3342,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, * find a node pointing to this leaf and record operations that * point to this leaf. 
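The hunk below replaces a locked list_empty() check with a test of the new BTRFS_FS_TREE_MOD_LOG_USERS bit. A sketch of how that bit is presumably maintained on the seq-list registration path (this side is not shown in the diff; names and locking are assumptions based on the removed code):

    u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                               struct btrfs_seq_list *elem)
    {
            write_lock(&fs_info->tree_mod_log_lock);
            if (!elem->seq) {
                    elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                    list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
                    /* A user registered: cheap flag for fast-path checks */
                    set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
            }
            write_unlock(&fs_info->tree_mod_log_lock);
            return elem->seq;
    }
    /* ...and presumably cleared in btrfs_put_tree_mod_seq() once the list goes empty. */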
*/ - if (btrfs_header_level(buf) == 0) { - read_lock(&fs_info->tree_mod_log_lock); - must_pin = !list_empty(&fs_info->tree_mod_seq_list); - read_unlock(&fs_info->tree_mod_log_lock); - } + if (btrfs_header_level(buf) == 0 && + test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { btrfs_redirty_list_add(trans->transaction, buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 910769d5fcdb..f2d1bb234377 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -2983,8 +2984,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) { if (is_data_inode(inode)) ret = btrfs_verify_data_csum(io_bio, - bio_offset, page, start, end, - mirror); + bio_offset, page, start, end); else ret = btrfs_validate_metadata_buffer(io_bio, page, start, end, mirror); @@ -3967,7 +3967,13 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb btrfs_tree_unlock(eb); - if (!ret) + /* + * Either we don't need to submit any tree block, or we're submitting a + * subpage eb. + * Subpage metadata doesn't use page locking at all, so we can skip + * the page locking. + */ + if (!ret || fs_info->sectorsize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4012,12 +4018,11 @@ err_unlock: return ret; } -static void set_btree_ioerr(struct page *page) +static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) { - struct extent_buffer *eb = (struct extent_buffer *)page->private; - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = eb->fs_info; - SetPageError(page); + btrfs_page_set_error(fs_info, page, eb->start, eb->len); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; @@ -4025,7 +4030,6 @@ static void set_btree_ioerr(struct page *page) * If we error out, we should add back the dirty_metadata_bytes * to make it consistent. */ - fs_info = eb->fs_info; percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, fs_info->dirty_metadata_batch); @@ -4069,26 +4073,111 @@ static void set_btree_ioerr(struct page *page) */ switch (eb->log_index) { case -1: - set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); break; case 0: - set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); break; case 1: - set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); break; default: BUG(); /* unexpected, logic error */ } } +/* + * The endio-specific version, which won't touch any unsafe spinlocks in endio + * context. + */ +static struct extent_buffer *find_extent_buffer_nolock( + struct btrfs_fs_info *fs_info, u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + return eb; + } + rcu_read_unlock(); + return NULL; +} + +/* + * The endio function for subpage extent buffer write. + * + * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() + * after all extent buffers in the page have finished their writeback. + */
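Usage-wise, the point of find_extent_buffer_nolock() is that endio code can take and drop an extent buffer reference without any spinlocks. Condensed, the pattern the function below relies on is:

    struct extent_buffer *eb = find_extent_buffer_nolock(fs_info, bytenr);

    if (eb) {
            /* inspect flags, clear writeback state, etc. */
            atomic_dec(&eb->refs);  /* not free_extent_buffer(): that may take eb->refs_lock */
    }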
+static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + u64 bvec_start = page_offset(page) + bvec->bv_offset; + u64 bvec_end = bvec_start + bvec->bv_len - 1; + u64 cur_bytenr = bvec_start; + + ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); + + /* Iterate through all extent buffers in the range */ + while (cur_bytenr <= bvec_end) { + struct extent_buffer *eb; + int done; + + /* + * Here we can't use find_extent_buffer(), as it may + * try to lock eb->refs_lock, which is not safe in endio + * context. + */ + eb = find_extent_buffer_nolock(fs_info, cur_bytenr); + ASSERT(eb); + + cur_bytenr = eb->start + eb->len; + + ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); + done = atomic_dec_and_test(&eb->io_pages); + ASSERT(done); + + if (bio->bi_status || + test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + ClearPageUptodate(page); + set_btree_ioerr(page, eb); + } + + btrfs_subpage_clear_writeback(fs_info, page, eb->start, + eb->len); + end_extent_buffer_writeback(eb); + /* + * free_extent_buffer() will grab a spinlock, which is + * not safe in endio context. Thus here we manually + * decrement the ref. + */ + atomic_dec(&eb->refs); + } + } + bio_put(bio); +} + static void end_bio_extent_buffer_writepage(struct bio *bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct extent_buffer *eb; int done; struct bvec_iter_all iter_all; + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return end_bio_subpage_eb_writepage(fs_info, bio); + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4100,7 +4189,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); - set_btree_ioerr(page); + set_btree_ioerr(page, eb); } end_page_writeback(page); @@ -4114,6 +4203,56 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +/* + * Unlike the work in write_one_eb(), we rely completely on extent locking. + * Page locking is only utilized at minimum to keep the VMM code happy. + * + * Callers should still go through write_one_eb() rather than calling this + * function directly, as write_one_eb() does extra preparation before + * submitting the extent buffer. + */
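For write_one_subpage_eb() below it helps to keep the subpage bitmap geometry in mind. A sketch of the arithmetic, assuming the common 64K page / 4K sector / 16K node configuration (the btrfs_subpage_* helpers are from this patch set; the mask computation is purely illustrative):

    /* One bit per sector: a 64K page with 4K sectors carries a 16-bit bitmap. */
    u32 first_bit = (eb->start - page_offset(page)) >> fs_info->sectorsize_bits;
    u32 nbits = eb->len >> fs_info->sectorsize_bits;  /* 16K / 4K = 4 bits per eb */
    u16 mask = ((1u << nbits) - 1) << first_bit;      /* the bits this eb owns */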
+static int write_one_subpage_eb(struct extent_buffer *eb, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + bool no_dirty_ebs = false; + int ret; + + /* clear_page_dirty_for_io() in the subpage helper needs the page locked */ + lock_page(page); + btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); + + /* Check if this is the last dirty bit to update nr_written */ + no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, + eb->start, eb->len); + if (no_dirty_ebs) + clear_page_dirty_for_io(page); + + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, + eb->start, eb->len, eb->start - page_offset(page), + &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, + false); + if (ret) { + btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); + set_btree_ioerr(page, eb); + unlock_page(page); + + if (atomic_dec_and_test(&eb->io_pages)) + end_extent_buffer_writeback(eb); + return -EIO; + } + unlock_page(page); + /* + * Submission finished without problem; if no range of the page is + * dirty anymore, we have submitted a page. Update nr_written in wbc. + */ + if (no_dirty_ebs) + update_nr_written(wbc, 1); + return ret; +} + static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) @@ -4145,6 +4284,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, memzero_extent_buffer(eb, start, end - start); } + if (eb->fs_info->sectorsize < PAGE_SIZE) + return write_one_subpage_eb(eb, wbc, epd); + for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4156,7 +4298,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, end_bio_extent_buffer_writepage, 0, 0, 0, false); if (ret) { - set_btree_ioerr(p); + set_btree_ioerr(p, eb); if (PageWriteback(p)) end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) @@ -4181,6 +4323,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, } /* + * Submit one subpage btree page. + * + * The main differences from submit_eb_page() are: + * - Page locking + * For subpage, we don't rely on page locking at all. + * + * - Flush write bio + * We only flush the bio if we may be unable to fit current extent buffers + * into the current bio. + * + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +static int submit_eb_subpage(struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; + u64 page_start = page_offset(page); + int bit_start = 0; + const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; + int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + int ret; + + /* Lock and write each dirty extent buffer in the range */ + while (bit_start < nbits) { + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct extent_buffer *eb; + unsigned long flags; + u64 start; + + /* + * Take the private lock to ensure the subpage won't be detached + * in the meantime.
+ */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + break; + } + spin_lock_irqsave(&subpage->lock, flags); + if (!((1 << bit_start) & subpage->dirty_bitmap)) { + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + bit_start++; + continue; + } + + start = page_start + bit_start * fs_info->sectorsize; + bit_start += sectors_per_node; + + /* + * Here we just want to grab the eb without touching extra + * spin locks, so call find_extent_buffer_nolock(). + */ + eb = find_extent_buffer_nolock(fs_info, start); + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + + /* + * The eb has already reached 0 refs thus find_extent_buffer() + * doesn't return it. We don't need to write back such eb + * anyway. + */ + if (!eb) + continue; + + ret = lock_extent_buffer_for_io(eb, epd); + if (ret == 0) { + free_extent_buffer(eb); + continue; + } + if (ret < 0) { + free_extent_buffer(eb); + goto cleanup; + } + ret = write_one_eb(eb, wbc, epd); + free_extent_buffer(eb); + if (ret < 0) + goto cleanup; + submitted++; + } + return submitted; + +cleanup: + /* We hit error, end bio for the submitted extent buffers */ + end_write_bio(epd, ret); + return ret; +} + +/* * Submit all page(s) of one extent buffer. * * @page: the page of one extent buffer @@ -4212,6 +4446,9 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return submit_eb_subpage(page, wbc, epd); + spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -4652,10 +4889,8 @@ void extent_readahead(struct readahead_control *rac) int nr; while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = page_offset(pagepool[0]); - u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); + u64 contig_start = readahead_pos(rac); + u64 contig_end = contig_start + readahead_batch_length(rac) - 1; contiguous_readpages(pagepool, nr, contig_start, contig_end, &em_cached, &bio, &bio_flags, &prev_em_start); @@ -5469,36 +5704,28 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - /* - * Lock our eb's refs_lock to avoid races with - * free_extent_buffer. When we get our eb it might be flagged - * with EXTENT_BUFFER_STALE and another task running - * free_extent_buffer might have seen that flag set, - * eb->refs == 2, that the buffer isn't under IO (dirty and - * writeback flags not set) and it's still in the tree (flag - * EXTENT_BUFFER_TREE_REF set), therefore being in the process - * of decrementing the extent buffer's reference count twice. - * So here we could race and increment the eb's reference count, - * clear its stale flag, mark it as dirty and drop our reference - * before the other task finishes executing free_extent_buffer, - * which would later result in an attempt to free an extent - * buffer that is dirty. 
- */ - if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { - spin_lock(&eb->refs_lock); - spin_unlock(&eb->refs_lock); - } - mark_extent_buffer_accessed(eb, NULL); - return eb; + eb = find_extent_buffer_nolock(fs_info, start); + if (!eb) + return NULL; + /* + * Lock our eb's refs_lock to avoid races with free_extent_buffer(). + * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and + * another task running free_extent_buffer() might have seen that flag + * set, eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process of + * decrementing the extent buffer's reference count twice. So here we + * could race and increment the eb's reference count, clear its stale + * flag, mark it as dirty and drop our reference before the other task + * finishes executing free_extent_buffer, which would later result in + * an attempt to free an extent buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); } - rcu_read_unlock(); - - return NULL; + mark_extent_buffer_accessed(eb, NULL); + return eb; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -5594,6 +5821,17 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-EINVAL); } +#if BITS_PER_LONG == 32 + if (start >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, + "extent buffer %llu is beyond 32bit page cache limit", start); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } + if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) + btrfs_warn_32bit_limit(fs_info); +#endif + if (fs_info->sectorsize < PAGE_SIZE && offset_in_page(start) + len > PAGE_SIZE) { btrfs_err(fs_info, @@ -5665,7 +5903,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_page_inc_eb_refs(fs_info, p); spin_unlock(&mapping->private_lock); - WARN_ON(PageDirty(p)); + WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -5814,28 +6052,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } +static void btree_clear_page_dirty(struct page *page) +{ + ASSERT(PageDirty(page)); + ASSERT(PageLocked(page)); + clear_page_dirty_for_io(page); + xa_lock_irq(&page->mapping->i_pages); + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&page->mapping->i_pages); +} + +static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + bool last; + + /* btree_clear_page_dirty() needs page locked */ + lock_page(page); + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, + eb->len); + if (last) + btree_clear_page_dirty(page); + unlock_page(page); + WARN_ON(atomic_read(&eb->refs) == 0); +} + void clear_extent_buffer_dirty(const struct extent_buffer *eb) { int i; int num_pages; struct page *page; + if (eb->fs_info->sectorsize < PAGE_SIZE) + return clear_subpage_extent_buffer_dirty(eb); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (!PageDirty(page)) continue; - lock_page(page); - WARN_ON(!PagePrivate(page)); - - clear_page_dirty_for_io(page); - xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) - __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - 
xa_unlock_irq(&page->mapping->i_pages); + btree_clear_page_dirty(page); ClearPageError(page); unlock_page(page); } @@ -5856,10 +6117,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); - if (!was_dirty) - for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + if (!was_dirty) { + bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + /* + * For subpage case, we can have other extent buffers in the + * same page, and in clear_subpage_extent_buffer_dirty() we + * have to clear page dirty without subpage lock held. + * This can cause race where our page gets dirty cleared after + * we just set it. + * + * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * its page for other reasons, we can use page lock to prevent + * the above race. + */ + if (subpage) + lock_page(eb->pages[0]); + for (i = 0; i < num_pages; i++) + btrfs_page_set_dirty(eb->fs_info, eb->pages[i], + eb->start, eb->len); + if (subpage) + unlock_page(eb->pages[0]); + } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) ASSERT(PageDirty(eb->pages[i])); @@ -6217,12 +6496,34 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, return ret; } +/* + * Check that the extent buffer is uptodate. + * + * For regular sector size == PAGE_SIZE case, check if @page is uptodate. + * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. + */ +static void assert_eb_page_uptodate(const struct extent_buffer *eb, + struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + if (fs_info->sectorsize < PAGE_SIZE) { + bool uptodate; + + uptodate = btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len); + WARN_ON(!uptodate); + } else { + WARN_ON(!PageUptodate(page)); + } +} + void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; - WARN_ON(!PageUptodate(eb->pages[0])); + assert_eb_page_uptodate(eb, eb->pages[0]); kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, BTRFS_FSID_SIZE); @@ -6232,7 +6533,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; - WARN_ON(!PageUptodate(eb->pages[0])); + assert_eb_page_uptodate(eb, eb->pages[0]); kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, BTRFS_FSID_SIZE); @@ -6257,7 +6558,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, while (len > 0) { page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); @@ -6286,7 +6587,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, while (len > 0) { page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); @@ -6344,7 +6645,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, while (len > 0) { page = dst->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(dst, page); cur = min(len, (unsigned long)(PAGE_SIZE - offset)); @@ -6406,7 +6707,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, eb_bitmap_offset(eb, start, nr, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); 
kaddr = page_address(page); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } @@ -6431,7 +6732,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); while (len >= bits_to_set) { @@ -6442,7 +6743,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); } } @@ -6474,7 +6775,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); while (len >= bits_to_clear) { @@ -6485,7 +6786,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); } } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 824640cb0ace..227215a5722c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -66,6 +66,7 @@ enum { struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; +struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; @@ -270,9 +271,6 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); -struct btrfs_fs_info; -struct btrfs_inode; - int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 47cd3a6dc635..294602f139ef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,6 +9,7 @@ #include <linux/highmem.h> #include <linux/sched/mm.h> #include <crypto/hash.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0e155f013839..864c08d08a35 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2014,14 +2014,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, else num_written = btrfs_buffered_write(iocb, from); - /* - * We also have to set last_sub_trans to the current log transid, - * otherwise subsequent syncs to a file that's been synced in this - * transaction will appear to have already occurred. - */ - spin_lock(&inode->lock); - inode->last_sub_trans = inode->root->log_transid; - spin_unlock(&inode->lock); + btrfs_set_inode_last_sub_trans(inode); + if (num_written > 0) num_written = generic_write_sync(iocb, num_written); @@ -2122,7 +2116,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - inode_lock(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -2135,11 +2129,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) &BTRFS_I(inode)->runtime_flags); /* - * Before we acquired the inode's lock, someone may have dirtied more - * pages in the target range. 
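The btrfs_file_write_iter hunk above folds the removed last_sub_trans update into a helper. Reconstructed from the deleted lines, btrfs_set_inode_last_sub_trans() should be roughly equivalent to:

    static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
    {
            spin_lock(&inode->lock);
            inode->last_sub_trans = inode->root->log_transid;
            spin_unlock(&inode->lock);
    }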
We need to make sure that writeback for - * any such pages does not start while we are logging the inode, because - * if it does, any of the following might happen when we are not doing a - * full inode sync: + * Before we acquired the inode's lock and the mmap lock, someone may + * have dirtied more pages in the target range. We need to make sure + * that writeback for any such pages does not start while we are logging + * the inode, because if it does, any of the following might happen when + * we are not doing a full inode sync: * * 1) We log an extent after its writeback finishes but before its * checksums are added to the csum tree, leading to -EIO errors @@ -2154,7 +2148,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -2255,7 +2249,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { @@ -2285,7 +2279,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -2605,16 +2599,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, * extents without inserting a new one, so we must abort the transaction to avoid * a corruption. */ -int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, - const u64 start, const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out) +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); - u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); - struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_block_rsv *rsv; unsigned int rsv_count; @@ -2662,10 +2657,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, drop_args.drop_cache = true; while (cur_offset < end) { drop_args.start = cur_offset; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); /* If we are punching a hole decrement the inode's byte count */ if (!extent_info) - btrfs_update_inode_bytes(BTRFS_I(inode), 0, + btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); if (ret != -ENOSPC) { /* @@ -2685,8 +2680,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_args.drop_end); + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2704,7 +2699,7 @@ int btrfs_replace_file_extents(struct inode *inode, 
struct btrfs_path *path, * know to not set disk_i_size in this area until a new * file extent is inserted here. */ - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); if (ret) { @@ -2723,8 +2718,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, u64 replace_len = drop_args.drop_end - extent_info->file_offset; - ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), - path, extent_info, replace_len, + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len, drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2735,9 +2730,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset += replace_len; } - cur_offset = drop_args.drop_end; - - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2756,9 +2749,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!extent_info) { - ret = find_first_non_hole(BTRFS_I(inode), &cur_offset, - &len); + cur_offset = drop_args.drop_end; + len = end - cur_offset; + if (!extent_info && len) { + ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; if (ret && !len) { @@ -2771,14 +2765,11 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, /* * If we were cloning, force the next fsync to be a full one since we * we replaced (or just dropped in the case of cloning holes when - * NO_HOLES is enabled) extents and extent maps. - * This is for the sake of simplicity, and cloning into files larger - * than 16Mb would force the full fsync any way (when - * try_release_extent_mapping() is invoked during page cache truncation. + * NO_HOLES is enabled) file extent items and did not setup new extent + * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); if (ret) goto out_trans; @@ -2804,8 +2795,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, */ if (!extent_info && cur_offset < ino_size && cur_offset < drop_args.drop_end) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_args.drop_end); + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); @@ -2813,8 +2804,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. 
*/ - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_args.drop_end - cur_offset); + ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, + drop_args.drop_end - cur_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2822,7 +2813,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { - ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path, + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); if (ret) { @@ -2868,7 +2859,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - inode_lock(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) @@ -2908,7 +2899,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } } @@ -2967,8 +2958,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, - &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, + lockend, NULL, &trans); btrfs_free_path(path); if (ret) goto out; @@ -3009,7 +3000,7 @@ out_only_mutex: ret = ret2; } } - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } @@ -3335,7 +3326,7 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; } - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3377,7 +3368,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } @@ -3487,7 +3478,7 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); /* Let go of our reservation. 
*/ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, @@ -3496,13 +3487,13 @@ out: return ret; } -static loff_t find_desired_extent(struct inode *inode, loff_t offset, +static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, int whence) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - loff_t i_size = inode->i_size; + loff_t i_size = inode->vfs_inode.i_size; u64 lockstart; u64 lockend; u64 start; @@ -3525,11 +3516,10 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset, lockend--; len = lockend - lockstart + 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state); while (start < i_size) { - em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len); + em = btrfs_get_extent_fiemap(inode, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; @@ -3551,7 +3541,7 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset, cond_resched(); } free_extent_map(em); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_extent_cached(&inode->io_tree, lockstart, lockend, &cached_state); if (ret) { offset = ret; @@ -3575,7 +3565,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - offset = find_desired_extent(inode, offset, whence); + offset = find_desired_extent(BTRFS_I(inode), offset, whence); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9988decd5717..e54466fc101f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> +#include "misc.h" #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -2539,6 +2540,7 @@ out: static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; @@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, } /* All the region is now unusable. 
Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length) + if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); + } else if (block_group->zone_unusable >= + div_factor_fine(block_group->length, + fs_info->bg_reclaim_threshold)) { + btrfs_mark_bg_to_reclaim(block_group); + } return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e5469eeabf8a..b21d491b3adc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -102,6 +102,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN + * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) { @@ -122,6 +123,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) } inode_lock(inode); } + if (ilock_flags & BTRFS_ILOCK_MMAP) + down_write(&BTRFS_I(inode)->i_mmap_lock); return 0; } @@ -133,6 +136,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) */ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) { + if (ilock_flags & BTRFS_ILOCK_MMAP) + up_write(&BTRFS_I(inode)->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) inode_unlock_shared(inode); else @@ -1516,7 +1521,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, - int *page_started, int force, + int *page_started, unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1530,6 +1535,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); bool nocow = false; u64 disk_bytenr = 0; + const bool force = inode->flags & BTRFS_INODE_NODATACOW; path = btrfs_alloc_path(); if (!path) { @@ -1863,23 +1869,16 @@ error: return ret; } -static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) +static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { - - if (!(inode->flags & BTRFS_INODE_NODATACOW) && - !(inode->flags & BTRFS_INODE_PREALLOC)) - return 0; - - /* - * @defrag_bytes is a hint value, no spinlock held here, - * if is not zero, it means the file is defragging. - * Force cow if given extent needs to be defragged. 
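In the free-space-cache hunk above, a zoned block group is queued for reclaim once its unusable space crosses a percentage of its size. With div_factor_fine(num, factor) computing num * factor / 100, and assuming the default BTRFS_DEFAULT_RECLAIM_THRESH is 75, the trigger works out as follows (the numbers are purely illustrative):

    /* For a 256M zoned block group with the assumed default threshold of 75: */
    u64 limit = div_factor_fine(block_group->length,
                                fs_info->bg_reclaim_threshold); /* 256M * 75 / 100 = 192M */

    if (block_group->zone_unusable >= limit)    /* i.e. >= 192M unusable */
            btrfs_mark_bg_to_reclaim(block_group);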
- */ - if (inode->defrag_bytes && - test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) - return 1; - - return 0; + if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { + if (inode->defrag_bytes && + test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, + 0, NULL)) + return false; + return true; + } + return false; } /* @@ -1891,17 +1890,12 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page struct writeback_control *wbc) { int ret; - int force_cow = need_force_cow(inode, start, end); const bool zoned = btrfs_is_zoned(inode->root->fs_info); - if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { - ASSERT(!zoned); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 1, nr_written); - } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { + if (should_nocow(inode, start, end)) { ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 0, nr_written); + page_started, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) @@ -3151,10 +3145,9 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) - * @mirror: mirror number */ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end, int mirror) + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -3393,15 +3386,19 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) int is_dead_root = 0; /* - * this is an orphan in the tree root. Currently these + * This is an orphan in the tree root. Currently these * could come from 2 sources: - * a) a snapshot deletion in progress + * a) a root (snapshot/subvolume) deletion in progress * b) a free space cache inode - * We need to distinguish those two, as the snapshot - * orphan must not get deleted. - * find_dead_roots already ran before us, so if this - * is a snapshot deletion, we should find the root - * in the fs_roots radix tree. + * We need to distinguish those two, as the orphan item + * for a root must not get deleted before the deletion + * of the snapshot/subvolume's tree completes. + * + * btrfs_find_orphan_roots() ran before us, which has + * found all deleted roots and loaded them into + * fs_info->fs_roots_radix. So here we can find if an + * orphan item corresponds to a deleted root by looking + * up the root from that radix tree. 
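The inode.c hunks in this region add BTRFS_ILOCK_MMAP on the write side and take i_mmap_lock shared in btrfs_page_mkwrite(). The intended interaction, sketched with hypothetical call sites:

    /* Ranged operations (hole punching, fallocate, ...) exclude page faults: */
    btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);  /* inode_lock() + down_write(i_mmap_lock) */
    /* ...drop and replace file extents... */
    btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);

    /* The fault path only shares the lock, so faults do not block one another: */
    down_read(&BTRFS_I(inode)->i_mmap_lock);
    lock_page(page);
    /* ...make the page writable... */
    up_read(&BTRFS_I(inode)->i_mmap_lock);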
*/ spin_lock(&fs_info->fs_roots_radix_lock); @@ -4332,7 +4329,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) goto out_end_trans; } - btrfs_record_root_in_trans(trans, dest); + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); @@ -7026,7 +7027,7 @@ next: if (ret) goto out; } else { - map = kmap(page); + map = kmap_local_page(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); if (pg_offset + copy_size < PAGE_SIZE) { @@ -7034,7 +7035,7 @@ next: PAGE_SIZE - pg_offset - copy_size); } - kunmap(page); + kunmap_local(map); } flush_dcache_page(page); } @@ -7262,6 +7263,19 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, return em; } +static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + bool readonly = false; + + block_group = btrfs_lookup_block_group(fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = true; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + /* * Check if we can do nocow write into the range [@offset, @offset + @len) * @@ -8403,17 +8417,11 @@ again: * for the finish_ordered_io */ if (TestClearPagePrivate2(page)) { - struct btrfs_ordered_inode_tree *tree; - u64 new_len; - - tree = &inode->ordered_tree; - - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - new_len = start - ordered->file_offset; - if (new_len < ordered->truncated_len) - ordered->truncated_len = new_len; - spin_unlock_irq(&tree->lock); + ordered->truncated_len = min(ordered->truncated_len, + start - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, start, @@ -8539,6 +8547,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: + down_read(&BTRFS_I(inode)->i_mmap_lock); lock_page(page); size = i_size_read(inode); @@ -8567,6 +8576,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; @@ -8619,11 +8629,10 @@ again: set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = fs_info->generation; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); @@ -8632,6 +8641,7 @@ again: out_unlock: unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, @@ -8883,6 +8893,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->i_mmap_lock); return inode; } @@ -9101,8 +9112,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_notrans; } - if (dest != root) - 
btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } /* * We need to find a free sequence number both in the source and @@ -9406,8 +9420,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_notrans; } - if (dest != root) - btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) @@ -9919,7 +9936,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( goto free_qgroup; } - ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, + ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 534db336a718..ee1dbabb5d3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -553,8 +553,6 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - leaf = NULL; btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); @@ -563,8 +561,22 @@ static noinline int create_subvol(struct inode *dir, key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); - if (ret) + if (ret) { + /* + * Since we don't abort the transaction in this case, free the + * tree block so that we don't leak space and leave the + * filesystem in an inconsistent state (an extent item in the + * extent tree without backreferences). Also no need to have + * the tree block locked since it is not in any tree at this + * point, so no other task can find it and use it. 
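The rename, rename_exchange and create_subvol hunks all grow the same check because btrfs_record_root_in_trans() is now fallible (it may need to allocate and insert a reloc root, per the relocation and transaction changes further down). The recurring shape, sketched rather than quoted from any one call site:

	ret = btrfs_record_root_in_trans(trans, root);
	if (ret) {
		/* some call sites also btrfs_abort_transaction(trans, ret) */
		goto out_fail;
	}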
+ */ + btrfs_free_tree_block(trans, root, leaf, 0, 1); + free_extent_buffer(leaf); goto fail; + } + + free_extent_buffer(leaf); + leaf = NULL; key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); @@ -577,7 +589,12 @@ static noinline int create_subvol(struct inode *dir, /* Freeing will be done in btrfs_put_root() of new_root */ anon_dev = 0; - btrfs_record_root_in_trans(trans, new_root); + ret = btrfs_record_root_in_trans(trans, new_root); + if (ret) { + btrfs_put_root(new_root); + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); @@ -870,7 +887,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - inode_unlock(dir); + btrfs_inode_unlock(dir, 0); return error; } @@ -1468,7 +1485,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; } else { @@ -1477,13 +1494,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ret = cluster_pages_for_defrag(inode, pages, i, cluster); } if (ret < 0) { - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); if (newer_than) { if (newer_off == (u64)-1) @@ -1531,9 +1548,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (do_compress) { - inode_lock(inode); + btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); } if (!file) kfree(ra); @@ -2968,9 +2985,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); if (!err) { fsnotify_rmdir(dir, dentry); d_delete(dentry); @@ -2979,7 +2996,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, out_dput: dput(dentry); out_unlock_dir: - inode_unlock(dir); + btrfs_inode_unlock(dir, 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 9084a950dc09..cd042c7567a4 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -118,7 +118,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; char *data_in; - char *cpage_out; + char *cpage_out, *sizes_ptr; int nr_pages = 0; struct page *in_page = NULL; struct page *out_page = NULL; @@ -258,10 +258,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* store the size of all chunks of compressed data */ - cpage_out = kmap(pages[0]); - write_compress_length(cpage_out, tot_out); - - kunmap(pages[0]); + sizes_ptr = kmap_local_page(pages[0]); + write_compress_length(sizes_ptr, tot_out); + kunmap_local(sizes_ptr); ret = 0; *total_out = tot_out; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 985a21558437..07b0b4218791 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -107,17 +107,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } -/* - * helper to check if a given offset is inside a given entry - */ -static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) -{ - if (file_offset < entry->file_offset || - 
entry->file_offset + entry->num_bytes <= file_offset) - return 0; - return 1; -} - static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, u64 len) { @@ -142,7 +131,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, if (tree->last) { entry = rb_entry(tree->last, struct btrfs_ordered_extent, rb_node); - if (offset_in_entry(entry, file_offset)) + if (in_range(file_offset, entry->file_offset, entry->num_bytes)) return tree->last; } ret = __tree_search(root, file_offset, &prev); @@ -349,7 +338,7 @@ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) + if (!in_range(*file_offset, entry->file_offset, entry->num_bytes)) goto out; dec_start = max(*file_offset, entry->file_offset); @@ -428,7 +417,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); have_entry: - if (!offset_in_entry(entry, file_offset)) + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) goto out; if (io_size > entry->bytes_left) @@ -779,7 +768,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, file_offset)) + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; if (entry) refcount_inc(&entry->refs); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 99e0853e4d3b..e60c07f36427 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -39,8 +39,8 @@ struct btrfs_ordered_sum { */ enum { /* - * Different types for direct io, one and only one of the 4 type can - * be set when creating ordered extent. + * Different types for ordered extents, one and only one of the 4 types + * need to be set when creating ordered extent. * * REGULAR: For regular non-compressed COW write * NOCOW: For NOCOW write into existing non-hole extent diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f0b9ef13153a..2319c923c9e6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -23,6 +23,7 @@ #include "qgroup.h" #include "block-group.h" #include "sysfs.h" +#include "tree-mod-log.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -2639,12 +2640,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record->data_rsv, BTRFS_QGROUP_RSV_DATA); /* - * Use SEQ_LAST as time_seq to do special search, which - * doesn't lock tree or delayed_refs and search current - * root. It's safe inside commit_transaction(). + * Use BTRFS_SEQ_LAST as time_seq to do special search, + * which doesn't lock tree or delayed_refs and search + * current root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, SEQ_LAST, &new_roots, false); + record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -3543,37 +3544,19 @@ static int try_flush_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle *trans; int ret; - bool can_commit = true; - /* - * If current process holds a transaction, we shouldn't flush, as we - * assume all space reservation happens before a transaction handle is - * held. - * - * But there are cases like btrfs_delayed_item_reserve_metadata() where - * we try to reserve space with one transction handle already held. 
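The removed offset_in_entry() helper and its in_range() replacement are the same half-open interval test; with the macro from btrfs's misc.h this is easy to verify:

	/* #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) */
	offset_in_entry(entry, off)
	    == (off >= entry->file_offset &&
	        off < entry->file_offset + entry->num_bytes)
	    == in_range(off, entry->file_offset, entry->num_bytes)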
- * In that case we can't commit transaction, but at least try to end it - * and hope the started data writes can free some space. - */ - if (current->journal_info && - current->journal_info != BTRFS_SEND_TRANS_STUB) - can_commit = false; + /* Can't hold an open transaction or we run the risk of deadlocking */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_SEND_TRANS_STUB); + if (WARN_ON(current->journal_info && + current->journal_info != BTRFS_SEND_TRANS_STUB)) + return 0; /* * We don't want to run flush again and again, so if there is a running * one, we won't try to start a new flush, but exit directly. */ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { - /* - * We are already holding a transaction, thus we can block other - * threads from flushing. So exit right now. This increases - * the chance of EDQUOT for heavy load and near limit cases. - * But we can argue that if we're already near limit, EDQUOT is - * unavoidable anyway. - */ - if (!can_commit) - return 0; - wait_event(root->qgroup_flush_wait, !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); return 0; @@ -3590,10 +3573,7 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; } - if (can_commit) - ret = btrfs_commit_transaction(trans); - else - ret = btrfs_end_transaction(trans); + ret = btrfs_commit_transaction(trans); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); @@ -3646,8 +3626,7 @@ cleanup: qgroup_unreserve_range(inode, reserved, start, len); out: if (new_reserved) { - extent_changeset_release(reserved); - kfree(reserved); + extent_changeset_free(reserved); *reserved_ret = NULL; } return ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8c31357f08ed..244d499ebc72 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,6 +13,7 @@ #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -1231,13 +1232,13 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); + pointers[stripe] = kmap_local_page(p); } /* then add the parity stripe */ p = rbio_pstripe_page(rbio, pagenr); SetPageUptodate(p); - pointers[stripe++] = kmap(p); + pointers[stripe++] = kmap_local_page(p); if (has_qstripe) { @@ -1247,7 +1248,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) */ p = rbio_qstripe_page(rbio, pagenr); SetPageUptodate(p); - pointers[stripe++] = kmap(p); + pointers[stripe++] = kmap_local_page(p); raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, pointers); @@ -1256,10 +1257,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) copy_page(pointers[nr_data], pointers[0]); run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - - - for (stripe = 0; stripe < rbio->real_stripes; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); } /* @@ -1634,7 +1633,8 @@ struct btrfs_plug_cb { /* * rbios on the plug list are sorted for easier merging. 
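The kmap()/kunmap() to kmap_local_page()/kunmap_local() conversions in lzo.c and raid56.c share one rule worth calling out: local mappings are stack-based and must be released in reverse order of creation, which is why finish_rmw() now unwinds the pointers array backwards and the recovery path below grows a dedicated unmap_array. A minimal sketch of the pairing rule (the helper name is made up):

static void copy_page_contents(struct page *dst, struct page *src)
{
	char *d = kmap_local_page(dst);
	char *s = kmap_local_page(src);

	memcpy(d, s, PAGE_SIZE);

	kunmap_local(s);	/* mapped last, unmapped first */
	kunmap_local(d);
}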
*/ -static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, plug_list); @@ -1776,6 +1776,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { int pagenr, stripe; void **pointers; + void **unmap_array; int faila = -1, failb = -1; struct page *page; blk_status_t err; @@ -1787,6 +1788,16 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto cleanup_io; } + /* + * Store copy of pointers that does not get reordered during + * reconstruction so that kunmap_local works. + */ + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!unmap_array) { + err = BLK_STS_RESOURCE; + goto cleanup_pointers; + } + faila = rbio->faila; failb = rbio->failb; @@ -1808,8 +1819,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) !test_bit(pagenr, rbio->dbitmap)) continue; - /* setup our array of pointers with pages - * from each stripe + /* + * Setup our array of pointers with pages from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* @@ -1823,7 +1837,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } else { page = rbio_stripe_page(rbio, stripe, pagenr); } - pointers[stripe] = kmap(page); + pointers[stripe] = kmap_local_page(page); + unmap_array[stripe] = pointers[stripe]; } /* all raid6 handling here */ @@ -1916,24 +1931,14 @@ pstripe: } } } - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * if we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - kunmap(page); - } + for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) + kunmap_local(unmap_array[stripe]); } err = BLK_STS_OK; cleanup: + kfree(unmap_array); +cleanup_pointers: kfree(pointers); cleanup_io: @@ -2358,13 +2363,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, goto cleanup; } SetPageUptodate(q_page); - pointers[rbio->real_stripes - 1] = kmap(q_page); + pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); } atomic_set(&rbio->error, 0); /* Map the parity stripe just once */ - pointers[nr_data] = kmap(p_page); + pointers[nr_data] = kmap_local_page(p_page); for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { struct page *p; @@ -2372,7 +2377,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); + pointers[stripe] = kmap_local_page(p); } if (has_qstripe) { @@ -2387,22 +2392,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap(p); + parity = kmap_local_page(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) copy_page(parity, pointers[rbio->scrubp]); else /* Parity is right, needn't writeback */ bitmap_clear(rbio->dbitmap, pagenr, 1); - kunmap(p); + kunmap_local(parity); - for (stripe = 0; stripe < nr_data; stripe++) - 
kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + for (stripe = nr_data - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); } - kunmap(p_page); + kunmap_local(pointers[nr_data]); __free_page(p_page); if (q_page) { - kunmap(q_page); + kunmap_local(pointers[rbio->real_stripes - 1]); __free_page(q_page); } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 762881b777b3..f4ec06b53aa0 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -478,9 +478,9 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; - ret = btrfs_replace_file_extents(inode, path, drop_start, - new_key.offset + datal - 1, &clone_info, - &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + drop_start, new_key.offset + datal - 1, + &clone_info, &trans); if (ret) goto out; } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -567,8 +567,8 @@ process_slot: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - ret = btrfs_replace_file_extents(inode, path, last_dest_end, - destoff + len - 1, NULL, &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; @@ -604,6 +604,20 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } +static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + down_write(&BTRFS_I(inode1)->i_mmap_lock); + down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); +} + +static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +{ + up_write(&BTRFS_I(inode1)->i_mmap_lock); + up_write(&BTRFS_I(inode2)->i_mmap_lock); +} + static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { @@ -820,6 +834,16 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, len, remap_flags); } +static bool file_sync_write(const struct file *file) +{ + if (file->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(file))) + return true; + + return false; +} + loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) @@ -832,10 +856,12 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (same_inode) - inode_lock(src_inode); - else + if (same_inode) { + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + } else { lock_two_nondirectories(src_inode, dst_inode); + btrfs_double_mmap_lock(src_inode, dst_inode); + } ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); @@ -848,10 +874,27 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: - if (same_inode) - inode_unlock(src_inode); - else + if (same_inode) { + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + } else { + btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); + } + + /* + * If either the source or the destination file was opened with O_SYNC, + * O_DSYNC or has the S_SYNC attribute, fsync both the destination and + * source files/ranges, so that after a successful return (0) followed + * by a power failure results in the reflinked data to be readable from + * 
both files/ranges. + */ + if (ret == 0 && len > 0 && + (file_sync_write(src_file) || file_sync_write(dst_file))) { + ret = btrfs_sync_file(src_file, off, off + len - 1, 0); + if (ret == 0) + ret = btrfs_sync_file(dst_file, destoff, + destoff + len - 1, 0); + } return ret < 0 ? ret : len; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 232d5da7b7be..b70be2ac2e9e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { - btrfs_panic(fs_info, -EEXIST, + btrfs_err(fs_info, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); + return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); @@ -733,10 +734,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; - int ret; + int ret = 0; + bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); + if (!root_item) + return ERR_PTR(-ENOMEM); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -748,7 +751,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; + /* * Set the last_snapshot field to the generation of the commit * root - like this ctree.c:btrfs_block_can_be_shared() behaves @@ -769,9 +774,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, */ ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; } + /* + * We have changed references at this point, we must abort the + * transaction if anything fails. 
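btrfs_double_mmap_lock() in the reflink hunk above uses the standard recipe for taking two locks of the same class: pick a stable total order (here the inode pointer values) and annotate the second acquisition so lockdep accepts it. The same recipe in generic form, assuming two arbitrary rw_semaphores:

static void double_down_write(struct rw_semaphore *a, struct rw_semaphore *b)
{
	if (a > b)
		swap(a, b);	/* fixed global order prevents ABBA deadlocks */
	down_write(a);
	down_write_nested(b, SINGLE_DEPTH_NESTING);
}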
+ */ + must_abort = true; + memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); @@ -789,14 +801,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); - BUG_ON(ret); + if (ret) + goto fail; + kfree(root_item); reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); - BUG_ON(IS_ERR(reloc_root)); + if (IS_ERR(reloc_root)) { + ret = PTR_ERR(reloc_root); + goto abort; + } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; +fail: + kfree(root_item); +abort: + if (must_abort) + btrfs_abort_transaction(trans, ret); + return ERR_PTR(ret); } /* @@ -856,9 +879,16 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, reloc_root = create_reloc_root(trans, root, root->root_key.objectid); if (clear_rsv) trans->block_rsv = rsv; + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } root->reloc_root = btrfs_grab_root(reloc_root); return 0; } @@ -875,7 +905,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!have_reloc_root(root)) - goto out; + return 0; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -908,10 +938,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); - BUG_ON(ret); btrfs_put_root(reloc_root); -out: - return 0; + return ret; } /* @@ -1185,8 +1213,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, int ret; int slot; - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1205,7 +1233,11 @@ again: if (cow) { ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return ret; + } } if (next_key) { @@ -1217,7 +1249,7 @@ again: parent = eb; while (1) { level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); + ASSERT(level >= lowest_level); ret = btrfs_bin_search(parent, &key, &slot); if (ret < 0) @@ -1265,7 +1297,11 @@ again: ret = btrfs_cow_block(trans, dest, eb, parent, slot, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + break; + } } btrfs_tree_unlock(parent); @@ -1289,7 +1325,11 @@ again: path->lowest_level = level; ret = btrfs_search_slot(trans, src, &key, path, 0, 1); path->lowest_level = 0; - BUG_ON(ret); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } /* * Info qgroup to trace both subtrees. 
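create_reloc_root() now distinguishes a point of no return: before btrfs_copy_root() has published new references, a failure only needs local cleanup; afterwards the transaction must be aborted. The generic two-label shape (prepare() and publish() are placeholders, not btrfs functions):

	ret = prepare();		/* no shared state touched yet */
	if (ret)
		goto fail;		/* plain cleanup is enough */
	must_abort = true;		/* references are visible from here on */
	ret = publish();
	if (ret)
		goto fail;
	return 0;
fail:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ret;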
@@ -1329,27 +1369,39 @@ again: ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0); ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_unlock_up_safe(path, 0); @@ -1537,12 +1589,13 @@ static int find_next_key(struct btrfs_path *path, int level, /* * Insert current subvolume into reloc_control::dirty_subvol_roots */ -static void insert_dirty_subvol(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root) +static int insert_dirty_subvol(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root) { struct btrfs_root *reloc_root = root->reloc_root; struct btrfs_root_item *reloc_root_item; + int ret; /* @root must be a subvolume tree root with a valid reloc tree */ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); @@ -1553,12 +1606,16 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, sizeof(reloc_root_item->drop_progress)); btrfs_set_root_drop_level(reloc_root_item, 0); btrfs_set_root_refs(reloc_root_item, 0); - btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + if (ret) + return ret; if (list_empty(&root->reloc_dirty_list)) { btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } + + return 0; } static int clean_dirty_subvols(struct reloc_control *rc) @@ -1760,8 +1817,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, out: btrfs_free_path(path); - if (ret == 0) - insert_dirty_subvol(trans, rc, root); + if (ret == 0) { + ret = insert_dirty_subvol(trans, rc, root); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (trans) btrfs_end_transaction_throttle(trans); @@ -1825,8 +1885,18 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + /* + * Even if we have an error we need this reloc root + * back on our list so we can clean up properly. + */ + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_abort_transaction(trans, (int)PTR_ERR(root)); + if (!err) + err = PTR_ERR(root); + break; + } + ASSERT(root->reloc_root == reloc_root); /* * set reference count to 1, so btrfs_recover_relocation @@ -1834,16 +1904,27 @@ again: */ if (!err) btrfs_set_root_refs(&reloc_root->root_item, 1); - btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + /* + * Even if we have an error we need this reloc root back on our + * list so we can clean up properly. 
+ */ list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); + + if (ret) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + break; + } } list_splice(&reloc_roots, &rc->reloc_roots); if (!err) - btrfs_commit_transaction(trans); + err = btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); return err; @@ -1888,8 +1969,29 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + /* + * For recovery we read the fs roots on mount, + * and if we didn't find the root then we marked + * the reloc root as a garbage root. For normal + * relocation obviously the root should exist in + * memory. However there's no reason we can't + * handle the error properly here just in case. + */ + ASSERT(0); + ret = PTR_ERR(root); + goto out; + } + if (root->reloc_root != reloc_root) { + /* + * This is actually impossible without something + * going really wrong (like weird race condition + * or cosmic rays). + */ + ASSERT(0); + ret = -EINVAL; + goto out; + } ret = merge_reloc_root(rc, root); btrfs_put_root(root); if (ret) { @@ -1971,8 +2073,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + + /* + * This should succeed, since we can't have a reloc root without having + * already looked up the actual root and created the reloc root for this + * root. + * + * However if there's some sort of corruption where we have a ref to a + * reloc root without a corresponding root this could return ENOENT. + */ + if (IS_ERR(root)) { + ASSERT(0); + return PTR_ERR(root); + } + if (root->reloc_root != reloc_root) { + ASSERT(0); + btrfs_err(fs_info, + "root %llu has two reloc roots associated with it", + reloc_root->root_key.offset); + btrfs_put_root(root); + return -EUCLEAN; + } ret = btrfs_record_root_in_trans(trans, root); btrfs_put_root(root); @@ -1988,26 +2109,77 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_backref_node *next; struct btrfs_root *root; int index = 0; + int ret; next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); - BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)); + + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a + * block that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve + * improperly. + * + * Both of these cases indicate file system corruption, or a bug + * in the backref walking code. 
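A pattern recurs throughout these relocation hunks: BUG_ON(cond) for "impossible unless the filesystem is corrupted" conditions becomes a developer assertion plus a graceful error, roughly:

	ASSERT(!cond);			/* loud on CONFIG_BTRFS_ASSERT builds */
	if (cond) {
		btrfs_err(fs_info, "...");	/* real message elided here */
		return -EUCLEAN;	/* corruption, not a kernel bug */
	}

so a corrupted image yields an error and an aborted transaction instead of a crash.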
+ */ + if (!root) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - record_reloc_root_in_trans(trans, root); + ret = record_reloc_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); break; } - btrfs_record_root_in_trans(trans, root); + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); root = root->reloc_root; + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. + */ + if (!root) + return ERR_PTR(-ENOENT); + if (next->new_bytenr != root->node->start) { - BUG_ON(next->new_bytenr); - BUG_ON(!list_empty(&next->list)); + /* + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set and this shouldn't be in the changed + * list. If it is then we have multiple roots pointing + * at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. + */ + ASSERT(next->new_bytenr == 0); + ASSERT(list_empty(&next->list)); + if (next->new_bytenr || !list_empty(&next->list)) { + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); + } + next->new_bytenr = root->node->start; btrfs_put_root(next->root); next->root = btrfs_grab_root(root); @@ -2024,8 +2196,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!next || next->level <= node->level) break; } - if (!root) - return NULL; + if (!root) { + /* + * This can happen if there's fs corruption or if there's a bug + * in the backref lookup code. + */ + ASSERT(0); + return ERR_PTR(-ENOENT); + } next = node; /* setup backref node path for btrfs_reloc_cow_block */ @@ -2061,7 +2239,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); + + /* + * This can occur if we have incomplete extent refs leading all + * the way up a particular path, in this case return -EUCLEAN. + */ + if (!root) + return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) @@ -2181,7 +2365,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, int slot; int ret = 0; - BUG_ON(lowest && node->eb); + /* + * If we are lowest then this is the first time we're processing this + * block, and thus shouldn't have an eb associated with it yet. + */ + ASSERT(!lowest || !node->eb); path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; @@ -2192,7 +2380,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, upper = edge->node[UPPER]; root = select_reloc_root(trans, rc, upper, edges); - BUG_ON(!root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto next; + } if (upper->eb && !upper->locked) { if (!lowest) { @@ -2266,7 +2457,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, free_extent_buffer(eb); if (ret < 0) goto next; - BUG_ON(node->eb != eb); + /* + * We've just COWed this block, it should have updated + * the correct backref node entry. 
+ */ + ASSERT(node->eb == eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2281,10 +2476,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb)); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); - - ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - BUG_ON(ret); + if (!ret) + ret = btrfs_drop_subtree(trans, root, eb, + upper->eb); + if (ret) + btrfs_abort_transaction(trans, ret); } next: if (!upper->pending) @@ -2302,7 +2498,12 @@ next: } path->lowest_level = 0; - BUG_ON(ret == -ENOSPC); + + /* + * We should have allocated all of our space in the block rsv and thus + * shouldn't ENOSPC. + */ + ASSERT(ret != -ENOSPC); return ret; } @@ -2434,16 +2635,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, BUG_ON(node->processed); root = select_one_root(node); - if (root == ERR_PTR(-ENOENT)) { - update_processed_blocks(rc, node); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + + /* See explanation in select_one_root for the -EUCLEAN case. */ + ASSERT(ret == -ENOENT); + if (ret == -ENOENT) { + ret = 0; + update_processed_blocks(rc, node); + } goto out; } if (root) { if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - BUG_ON(node->new_bytenr); - BUG_ON(!list_empty(&node->list)); - btrfs_record_root_in_trans(trans, root); + /* + * This block was the root block of a root, and this is + * the first time we're processing the block and thus it + * should not have had the ->new_bytenr modified and + * should have not been included on the changed list. + * + * However in the case of corruption we could have + * multiple refs pointing to the same block improperly, + * and thus we would trip over these checks. ASSERT() + * for the developer case, because it could indicate a + * bug in the backref code, however error out for a + * normal user in the case of corruption. + */ + ASSERT(node->new_bytenr == 0); + ASSERT(list_empty(&node->list)); + if (node->new_bytenr || !list_empty(&node->list)) { + btrfs_err(root->fs_info, + "bytenr %llu has improper references to it", + node->bytenr); + ret = -EUCLEAN; + goto out; + } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + goto out; + /* + * Another thread could have failed, need to check if we + * have reloc_root actually set. 
+ */ + if (!root->reloc_root) { + ret = -ENOENT; + goto out; + } root = root->reloc_root; node->new_bytenr = root->node->start; btrfs_put_root(node->root); @@ -2578,7 +2816,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( return btrfs_end_transaction(trans); } - inode_lock(&inode->vfs_inode); + btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -2596,7 +2834,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) break; } - inode_unlock(&inode->vfs_inode); + btrfs_inode_unlock(&inode->vfs_inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, @@ -3220,20 +3458,6 @@ static void unset_reloc_control(struct reloc_control *rc) mutex_unlock(&fs_info->reloc_mutex); } -static int check_extent_flags(u64 flags) -{ - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if (!(flags & BTRFS_EXTENT_FLAG_DATA) && - !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - return 1; - return 0; -} - static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { @@ -3272,8 +3496,7 @@ int prepare_to_relocate(struct reloc_control *rc) */ return PTR_ERR(trans); } - btrfs_commit_transaction(trans); - return 0; + return btrfs_commit_transaction(trans); } static noinline_for_stack int relocate_block_group(struct reloc_control *rc) @@ -3285,7 +3508,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct btrfs_path *path; struct btrfs_extent_item *ei; u64 flags; - u32 item_size; int ret; int err = 0; int progress = 0; @@ -3334,19 +3556,7 @@ restart: ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (item_size >= sizeof(*ei)) { - flags = btrfs_extent_flags(path->nodes[0], ei); - ret = check_extent_flags(flags); - BUG_ON(ret); - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - err = -EINVAL; - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, err); - break; - } else { - BUG(); - } + flags = btrfs_extent_flags(path->nodes[0], ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); @@ -3445,7 +3655,9 @@ restart: err = PTR_ERR(trans); goto out_free; } - btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); + if (ret && !err) + err = ret; out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) @@ -3488,6 +3700,35 @@ out: return ret; } +static void delete_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + ret = btrfs_del_item(trans, root, path); +out: + if (ret) + btrfs_abort_transaction(trans, ret); + btrfs_free_path(path); +} + /* * helper to create inode for data relocation. 
* the inode is in data relocation tree and its link count is 0 @@ -3514,10 +3755,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, goto out; err = __insert_orphan_inode(trans, root, objectid); - BUG_ON(err); + if (err) + goto out; inode = btrfs_iget(fs_info->sb, objectid, root); - BUG_ON(IS_ERR(inode)); + if (IS_ERR(inode)) { + delete_orphan_inode(trans, root, objectid); + err = PTR_ERR(inode); + inode = NULL; + goto out; + } BTRFS_I(inode)->index_cnt = group->start; err = btrfs_orphan_add(trans, BTRFS_I(inode)); @@ -3859,7 +4106,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) } err = __add_reloc_root(reloc_root); - BUG_ON(err < 0); /* -ENOMEM or logic error */ + ASSERT(err != -EEXIST); + if (err) { + list_add_tail(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(fs_root); + btrfs_end_transaction(trans); + goto out_unset; + } fs_root->reloc_root = btrfs_grab_root(reloc_root); btrfs_put_root(fs_root); } @@ -4074,7 +4327,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3d9088eab2fc..485cda3eb8d7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -206,9 +206,6 @@ struct full_stripe_lock { struct mutex mutex; }; -static void scrub_pending_bio_inc(struct scrub_ctx *sctx); -static void scrub_pending_bio_dec(struct scrub_ctx *sctx); -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, @@ -226,14 +223,11 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); -static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, @@ -251,8 +245,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); -static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); -static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); static inline int scrub_is_page_on_raid56(struct scrub_page *spage) @@ -3682,8 +3674,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, spin_lock(&cache->lock); if (!cache->to_copy) { spin_unlock(&cache->lock); - ro_set = 0; - goto done; + btrfs_put_block_group(cache); + goto skip; } spin_unlock(&cache->lock); } @@ -3841,7 +3833,6 
@@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, cache, found_key.offset)) ro_set = 0; -done: down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f323859156b..55741adf9071 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx) path = alloc_path_for_send(); if (!path) return -ENOMEM; + path->reada = READA_FORWARD_ALWAYS; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -6688,15 +6689,35 @@ out: return ret; } -static int tree_move_down(struct btrfs_path *path, int *level) +static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; + struct extent_buffer *parent = path->nodes[*level]; + int slot = path->slots[*level]; + const int nritems = btrfs_header_nritems(parent); + u64 reada_max; + u64 reada_done = 0; BUG_ON(*level == 0); - eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); + eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); + /* + * Trigger readahead for the next leaves we will process, so that it is + * very likely that when we need them they are already in memory and we + * will not block on disk IO. For nodes we only do readahead for one, + * since the time window between processing nodes is typically larger. + */ + reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize); + + for (slot++; slot < nritems && reada_done < reada_max; slot++) { + if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { + btrfs_readahead_node_child(parent, slot); + reada_done += eb->fs_info->nodesize; + } + } + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; @@ -6736,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, static int tree_advance(struct btrfs_path *path, int *level, int root_level, int allow_down, - struct btrfs_key *key) + struct btrfs_key *key, + u64 reada_min_gen) { int ret; if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(path, level, root_level); } else { - ret = tree_move_down(path, level); + ret = tree_move_down(path, level, reada_min_gen); } if (ret >= 0) { if (*level == 0) @@ -6817,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, u64 right_blockptr; u64 left_gen; u64 right_gen; + u64 reada_min_gen; left_path = btrfs_alloc_path(); if (!left_path) { @@ -6896,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } + /* + * Our right root is the parent root, while the left root is the "send" + * root. We know that all new nodes/leaves in the left root must have + * a generation greater than the right root's generation, so we trigger + * readahead for those nodes and leaves of the left root, as we know we + * will need to read them at some point. 
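To make the send readahead window concrete (numbers assume the default 16K nodesize; the constants come from the tree_move_down() hunk above):

	/*
	 * reada_max = SZ_128K when descending into leaves (*level == 1),
	 * i.e. up to 128K / 16K = 8 leaves prefetched per step, but only
	 * nodesize (a single node) when descending through inner nodes.
	 * Children with generation <= reada_min_gen already exist in the
	 * parent snapshot and are skipped, since only new blocks need to
	 * be read by the send diffing loop.
	 */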
+ */ + reada_min_gen = btrfs_header_generation(right_root->commit_root); up_read(&fs_info->commit_root_sem); if (left_level == 0) @@ -6920,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = tree_advance(left_path, &left_level, left_root_level, advance_left != ADVANCE_ONLY_NEXT, - &left_key); + &left_key, reada_min_gen); if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) @@ -6931,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = tree_advance(right_path, &right_level, right_root_level, advance_right != ADVANCE_ONLY_NEXT, - &right_key); + &right_key, reada_min_gen); if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2da6177f4b0b..2dc674b7c3b1 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * of heavy DIO or ordered reservations, preemptive flushing will just * waste time and cause us to slow down. */ - ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); - delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); + delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) used += fs_info->delayed_refs_rsv.reserved + fs_info->delayed_block_rsv.reserved; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index c69049e7daa9..2d19089ab625 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -4,6 +4,64 @@ #include "ctree.h" #include "subpage.h" +/* + * Subpage (sectorsize < PAGE_SIZE) support overview: + * + * Limitations: + * + * - Only support 64K page size for now + * This is to make metadata handling easier, as 64K page would ensure + * all nodesize would fit inside one page, thus we don't need to handle + * cases where a tree block crosses several pages. + * + * - Only metadata read-write for now + * The data read-write part is in development. + * + * - Metadata can't cross 64K page boundary + * btrfs-progs and kernel have done that for a while, thus only ancient + * filesystems could have such problem. For such case, do a graceful + * rejection. + * + * Special behavior: + * + * - Metadata + * Metadata read is fully supported. + * Meaning when reading one tree block will only trigger the read for the + * needed range, other unrelated range in the same page will not be touched. + * + * Metadata write support is partial. + * The writeback is still for the full page, but we will only submit + * the dirty extent buffers in the page. + * + * This means, if we have a metadata page like this: + * + * Page offset + * 0 16K 32K 48K 64K + * |/////////| |///////////| + * \- Tree block A \- Tree block B + * + * Even if we just want to writeback tree block A, we will also writeback + * tree block B if it's also dirty. + * + * This may cause extra metadata writeback which results more COW. + * + * Implementation: + * + * - Common + * Both metadata and data will use a new structure, btrfs_subpage, to + * record the status of each sector inside a page. This provides the extra + * granularity needed. + * + * - Metadata + * Since we have multiple tree blocks inside one page, we can't rely on page + * locking anymore, or we will have greatly reduced concurrency or even + * deadlocks (hold one tree lock while trying to lock another tree lock in + * the same page). 
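On the space-info change: percpu_counter_read_positive() just reads the counter's cached central value, while percpu_counter_sum_positive() folds in every CPU's pending delta under a lock. The read variant can lag by up to the counter batch size times the number of CPUs, which is acceptable here because need_preemptive_reclaim() is only a heuristic. A sketch of the tradeoff:

	/* exact but O(nr_cpus) and takes the counter lock: */
	exact  = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	/* approximate (within batch * nr_cpus) but O(1): */
	approx = percpu_counter_read_positive(&fs_info->ordered_bytes);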
+ * + * Thus for metadata locking, subpage support relies on io_tree locking only. + * This means a slightly higher tree locking latency. + */ + int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { @@ -220,6 +278,82 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->dirty_bitmap |= tmp; + spin_unlock_irqrestore(&subpage->lock, flags); + set_page_dirty(page); +} + +/* + * Extra clear_and_test function for subpage dirty bitmap. + * + * Return true if we're the last bits in the dirty_bitmap and clear the + * dirty_bitmap. + * Return false otherwise. + * + * NOTE: Callers should manually clear page dirty for true case, as we have + * extra handling for tree blocks. + */ +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + bool last = false; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->dirty_bitmap &= ~tmp; + if (subpage->dirty_bitmap == 0) + last = true; + spin_unlock_irqrestore(&subpage->lock, flags); + return last; +} + +void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + bool last; + + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + if (last) + clear_page_dirty_for_io(page); +} + +void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->writeback_bitmap |= tmp; + set_page_writeback(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->writeback_bitmap &= ~tmp; + if (subpage->writeback_bitmap == 0) + end_page_writeback(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. 
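A worked example for the new u16 bitmaps, assuming 4K sectorsize on a 64K page and that btrfs_subpage_calc_bitmap() maps sector i of the page to bit i (consistent with the 16 sectors a u16 can cover):

	/*
	 * 64K page / 4K sectorsize = 16 sectors, one bit each.
	 * For start = page offset 16K, len = 8K:
	 *   first bit = 16K / 4K = 4, nbits = 8K / 4K = 2
	 *   tmp = ((1 << 2) - 1) << 4 = 0x0030
	 * btrfs_subpage_set_dirty() ORs 0x0030 into dirty_bitmap, and the
	 * page-wide PageDirty flag stays set until the last bit clears.
	 */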
@@ -240,6 +374,8 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -276,3 +412,7 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, + PageDirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, + PageWriteback); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index b86a4881475d..bfd626e955be 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -20,6 +20,8 @@ struct btrfs_subpage { spinlock_t lock; u16 uptodate_bitmap; u16 error_bitmap; + u16 dirty_bitmap; + u16 writeback_bitmap; union { /* * Structures only used by metadata @@ -87,5 +89,10 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(error); +DECLARE_BTRFS_SUBPAGE_OPS(dirty); +DECLARE_BTRFS_SUBPAGE_OPS(writeback); + +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f7a4ad86adee..4a396c1147f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -252,6 +252,32 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . } #endif +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + /* * We only mark the transaction aborted and then set the file system read-only. 
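The arithmetic behind the new 32-bit warnings, assuming 4K pages: the page cache indexes the metadata inode with pgoff_t (an unsigned long, 32 bits on these systems), so at most

	2^32 pages * 2^12 bytes/page = 2^44 bytes = 16T

of logical address space is reachable, which is why BTRFS_32BIT_MAX_FILE_SIZE >> 40 prints the limit in terabytes.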
* This will prevent new transactions from starting or trying to join this diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6eb1c50fa98c..436ac7b4b334 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -360,11 +360,26 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_rescue_options, supported_rescue_options_show); +static ssize_t supported_sectorsizes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + + /* Only sectorsize == PAGE_SIZE is now supported */ + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); + + return ret; +} +BTRFS_ATTR(static_feature, supported_sectorsizes, + supported_sectorsizes_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), + BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), NULL }; @@ -965,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); +static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + ssize_t ret; + + ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold); + + return ret; +} + +static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh <= 50 || thresh > 100) + return -EINVAL; + + fs_info->bg_reclaim_threshold = thresh; + + return len; +} +BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, + btrfs_bg_reclaim_threshold_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -976,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, exclusive_operation), BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), + BTRFS_ATTR_PTR(, bg_reclaim_threshold), NULL, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index acff6bb49a97..f75de9f6c0ad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -260,6 +260,7 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_transaction *cur_trans = trans->transaction; if (!trans->chunk_bytes_reserved) return; @@ -268,6 +269,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, trans->chunk_bytes_reserved, NULL); + atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved); + cond_wake_up(&cur_trans->chunk_reserve_wait); trans->chunk_bytes_reserved = 0; } @@ -383,6 +386,8 @@ loop: spin_lock_init(&cur_trans->dropped_roots_lock); INIT_LIST_HEAD(&cur_trans->releasing_ebs); spin_lock_init(&cur_trans->releasing_ebs_lock); + atomic64_set(&cur_trans->chunk_bytes_reserved, 0); + init_waitqueue_head(&cur_trans->chunk_reserve_wait); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, 
&cur_trans->dirty_pages,
			    IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -408,6 +413,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
			       int force)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret = 0;
 
 	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 	     root->last_trans < trans->transid) || force) {
@@ -456,11 +462,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
		 * lock. smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
-		btrfs_init_reloc_root(trans, root);
+		ret = btrfs_init_reloc_root(trans, root);
 		smp_mb__before_atomic();
 		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
 	}
-	return 0;
+	return ret;
 }
 
@@ -487,6 +493,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret;
 
 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		return 0;
@@ -501,10 +508,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 		return 0;
 
 	mutex_lock(&fs_info->reloc_mutex);
-	record_root_in_trans(trans, root, 0);
+	ret = record_root_in_trans(trans, root, 0);
 	mutex_unlock(&fs_info->reloc_mutex);
 
-	return 0;
+	return ret;
 }
 
 static inline int is_transaction_blocked(struct btrfs_transaction *trans)
@@ -741,7 +748,16 @@ got_it:
	 * Thus it needs to be called after current->journal_info is
	 * initialized, or we can deadlock.
	 */
-	btrfs_record_root_in_trans(h, root);
+	ret = btrfs_record_root_in_trans(h, root);
+	if (ret) {
+		/*
+		 * The transaction handle is fully initialized and linked with
+		 * other structures so it needs to be ended in case of errors,
+		 * not just freed.
+		 */
+		btrfs_end_transaction(h);
+		return ERR_PTR(ret);
+	}
 
 	return h;
 
@@ -1347,7 +1363,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
 			spin_unlock(&fs_info->fs_roots_radix_lock);
 
 			btrfs_free_log(trans, root);
-			btrfs_update_reloc_root(trans, root);
+			ret2 = btrfs_update_reloc_root(trans, root);
+			if (ret2)
+				return ret2;
 
 			/* see comments in should_cow_block() */
 			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@@ -1440,7 +1458,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
	 * recorded root will never be updated again, causing an outdated root
	 * item.
	 */
-	record_root_in_trans(trans, src, 1);
+	ret = record_root_in_trans(trans, src, 1);
+	if (ret)
+		return ret;
 
	/*
	 * btrfs_qgroup_inherit relies on a consistent view of the usage for the
@@ -1509,7 +1529,7 @@ out:
	 *    insert_dir_item()
	 */
 	if (!ret)
-		record_root_in_trans(trans, parent, 1);
+		ret = record_root_in_trans(trans, parent, 1);
 	return ret;
 }
 
@@ -1586,8 +1606,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	dentry = pending->dentry;
 	parent_inode = pending->dir;
 	parent_root = BTRFS_I(parent_inode)->root;
-	record_root_in_trans(trans, parent_root, 0);
-
+	ret = record_root_in_trans(trans, parent_root, 0);
+	if (ret)
+		goto fail;
 	cur_time = current_time(parent_inode);
 
	/*
@@ -1623,7 +1644,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		goto fail;
 	}
 
-	record_root_in_trans(trans, root, 0);
+	ret = record_root_in_trans(trans, root, 0);
+	if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto fail;
+	}
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 	btrfs_check_and_init_root_item(new_root_item);
@@ -1961,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
	 */
 	BUG_ON(list_empty(&cur_trans->list));
 
-	list_del_init(&cur_trans->list);
 	if (cur_trans == fs_info->running_transaction) {
 		cur_trans->state = TRANS_STATE_COMMIT_DOING;
 		spin_unlock(&fs_info->trans_lock);
@@ -1970,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
 
 		spin_lock(&fs_info->trans_lock);
 	}
+
+	/*
+	 * Now that we know no one else is still using the transaction we can
+	 * remove the transaction from the list of transactions. This prevents
+	 * the transaction kthread from cleaning up the transaction while some
+	 * other task is still using it, which could result in a use-after-free
+	 * on things like log trees, as it forces the transaction kthread to
+	 * wait for this transaction to be cleaned up by us.
+	 */
+	list_del_init(&cur_trans->list);
+
 	spin_unlock(&fs_info->trans_lock);
 
 	btrfs_cleanup_one_transaction(trans->transaction, fs_info);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6335716e513f..364cfbb4c5c5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -96,6 +96,13 @@ struct btrfs_transaction {
 
 	spinlock_t releasing_ebs_lock;
 	struct list_head releasing_ebs;
+
+	/*
+	 * The number of bytes currently reserved, by all transaction handles
+	 * attached to this transaction, for metadata extents of the chunk tree.
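+	 *
+	 * (Editor's note: the counter is decremented, and chunk_reserve_wait
+	 * woken, by btrfs_trans_release_chunk_metadata() in the transaction.c
+	 * hunk above; the matching increment is outside this excerpt.)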
+	 */
+	atomic64_t chunk_bytes_reserved;
+	wait_queue_head_t chunk_reserve_wait;
 };
 
 #define __TRANS_FREEZABLE	(1U << 0)
 
@@ -175,7 +182,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 	spin_lock(&inode->lock);
 	inode->last_trans = trans->transaction->transid;
 	inode->last_sub_trans = inode->root->log_transid;
-	inode->last_log_commit = inode->root->last_log_commit;
+	inode->last_log_commit = inode->last_sub_trans - 1;
 	spin_unlock(&inode->lock);
 }
 
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index f4ade821307d..a8b2e0d2c025 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf,
 				   key->offset, fs_info->sectorsize);
 			return -EUCLEAN;
 		}
+		if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+			extent_err(leaf, slot,
+			"invalid extent flag, data has full backref set");
+			return -EUCLEAN;
+		}
 	}
 	ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 92a368627791..f67721d82e5d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3165,20 +3165,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
	 */
 	mutex_unlock(&root->log_mutex);
 
-	btrfs_init_log_ctx(&root_log_ctx, NULL);
-
-	mutex_lock(&log_root_tree->log_mutex);
-
 	if (btrfs_is_zoned(fs_info)) {
+		mutex_lock(&fs_info->tree_root->log_mutex);
 		if (!log_root_tree->node) {
 			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
 			if (ret) {
-				mutex_unlock(&log_root_tree->log_mutex);
+				mutex_unlock(&fs_info->tree_root->log_mutex);
 				goto out;
 			}
 		}
+		mutex_unlock(&fs_info->tree_root->log_mutex);
 	}
 
+	btrfs_init_log_ctx(&root_log_ctx, NULL);
+
+	mutex_lock(&log_root_tree->log_mutex);
+
 	index2 = log_root_tree->log_transid % 2;
 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
 	root_log_ctx.log_transid = log_root_tree->log_transid;
@@ -4136,7 +4138,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+		      const struct list_head *b)
 {
 	struct extent_map *em1, *em2;
 
@@ -6278,8 +6281,13 @@ again:
 		}
 
 		wc.replay_dest->log_root = log;
-		btrfs_record_root_in_trans(trans, wc.replay_dest);
-		ret = walk_log_tree(trans, log, &wc);
+		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
+		if (ret)
+			/* The loop needs to continue due to the root refs */
+			btrfs_handle_fs_error(fs_info, ret,
+				"failed to record the log root in transaction");
+		else
+			ret = walk_log_tree(trans, log, &wc);
 
 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
new file mode 100644
index 000000000000..8a3a14686d3e
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.c
@@ -0,0 +1,929 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "tree-mod-log.h"
+#include "disk-io.h"
+
+struct tree_mod_root {
+	u64 logical;
+	u8 level;
+};
+
+struct tree_mod_elem {
+	struct rb_node node;
+	u64 logical;
+	u64 seq;
+	enum btrfs_mod_log_op op;
+
+	/*
+	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
+	 * operations.
+	 */
+	int slot;
+
+	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
+	u64 generation;
+
+	/* These are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
+	struct btrfs_disk_key key;
+	u64 blockptr;
+
+	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS.
*/ + struct { + int dst_slot; + int nr_items; + } move; + + /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ + struct tree_mod_root old_root; +}; + +/* + * Pull a new tree mod seq number for our operation. + */ +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. + */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + write_lock(&fs_info->tree_mod_log_lock); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } + write_unlock(&fs_info->tree_mod_log_lock); + + return elem->seq; +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct tree_mod_elem *tm; + u64 min_seq = BTRFS_SEQ_LAST; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + write_lock(&fs_info->tree_mod_log_lock); + list_del(&elem->list); + elem->seq = 0; + + if (list_empty(&fs_info->tree_mod_seq_list)) { + clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } else { + struct btrfs_seq_list *first; + + first = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + if (seq_putting > first->seq) { + /* + * Blocker with lower sequence number exists, we cannot + * remove anything from the log. + */ + write_unlock(&fs_info->tree_mod_log_lock); + return; + } + min_seq = first->seq; + } + + /* + * Anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = rb_entry(node, struct tree_mod_elem, node); + if (tm->seq >= min_seq) + continue; + rb_erase(node, tm_root); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * Key order of the log: + * node/leaf start address -> sequence + * + * The 'start address' is the logical address of the *new* root node for root + * replace operations, or the logical address of the affected block for all + * other operations. 
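+ *
+ * (Editor's illustrative note, not from the original patch: under this
+ * ordering, two modifications of the block at logical address L with
+ * sequence numbers 10 and 11 sort as the adjacent keys (L, 10) and
+ * (L, 11), so all logged modifications of one block form a contiguous,
+ * age-ordered run in the rb-tree.)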
+ */ +static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, + struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + + lockdep_assert_held_write(&fs_info->tree_mod_log_lock); + + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = rb_entry(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->logical < tm->logical) + new = &((*new)->rb_left); + else if (cur->logical > tm->logical) + new = &((*new)->rb_right); + else if (cur->seq < tm->seq) + new = &((*new)->rb_left); + else if (cur->seq > tm->seq) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); + return 0; +} + +/* + * Determines if logging can be omitted. Returns true if it can. Otherwise, it + * returns false with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * write unlock fs_info::tree_mod_log_lock. + */ +static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return true; + if (eb && btrfs_header_level(eb) == 0) + return true; + + write_lock(&fs_info->tree_mod_log_lock); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + write_unlock(&fs_info->tree_mod_log_lock); + return true; + } + + return false; +} + +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return false; + if (eb && btrfs_header_level(eb) == 0) + return false; + + return true; +} + +static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, + int slot, + enum btrfs_mod_log_op op, + gfp_t flags) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return NULL; + + tm->logical = eb->start; + if (op != BTRFS_MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); + + return tm; +} + +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(eb->fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + kfree(tm); + + return ret; +} + +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; + int i; + bool locked = false; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = 
BTRFS_MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(eb->fs_info, eb)) + goto free_tms; + locked = true; + + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert(eb->fs_info, tm_list[i]); + if (ret) + goto free_tms; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); + if (ret) + goto free_tms; + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + kfree(tm); + + return ret; +} + +static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) +{ + int i, j; + int ret; + + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } + } + + return 0; +} + +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal) +{ + struct btrfs_fs_info *fs_info = old_root->fs_info; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->logical = new_root->start; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = tree_mod_log_insert(fs_info, tm); + + write_unlock(&fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; +} + +static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq, + bool smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = rb_entry(node, struct tree_mod_elem, node); + if (cur->logical < start) { + node = node->rb_left; + } else if 
(cur->logical > start) { + node = node->rb_right; + } else if (cur->seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* We want the node with the highest seq */ + if (found) + BUG_ON(found->seq > cur->seq); + found = cur; + node = node->rb_left; + } else if (cur->seq > min_seq) { + /* We want the node with the smallest seq */ + if (found) + BUG_ON(found->seq < cur->seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return found; +} + +/* + * This returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). Any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, true); +} + +/* + * This returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). Any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, false); +} + +int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items) +{ + struct btrfs_fs_info *fs_info = dst->fs_info; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; + int i; + bool locked = false; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return 0; + + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; + for (i = 0; i < nr_items; i++) { + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = true; + + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return ret; +} + +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) +{ + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if 
(tree_mod_dont_log(eb->fs_info, eb)) + goto free_tms; + + ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; +} + +/* + * Returns the logical address of the oldest predecessor of the given root. + * Entries older than time_seq are ignored. + */ +static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root, + u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = eb_root->start; + bool looped = false; + + if (!time_seq) + return NULL; + + /* + * The very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). This has the logical address + * of the *new* root, making it the very first operation that's logged + * for this root. + */ + while (1) { + tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, + time_seq); + if (!looped && !tm) + return NULL; + /* + * If there are no tree operation for the oldest root, we simply + * return it. This should only happen if that (old) root is at + * level 0. + */ + if (!tm) + break; + + /* + * If there's an operation that's not a root replacement, we + * found the oldest version of our root. Normally, we'll find a + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. + */ + if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + looped = true; + } + + /* If there's no old root to return, return what we found instead */ + if (!found) + found = tm; + + return found; +} + + +/* + * tm is a pointer to the first operation to rewind within eb. Then, all + * previous operations will be rewound (until we reach something older than + * time_seq). + */ +static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + read_lock(&fs_info->tree_mod_log_lock); + while (tm && tm->seq >= time_seq) { + /* + * All the operations are recorded with the operator used for + * the modification. As we're going backwards, we do the + * opposite of each operation here. + */ + switch (tm->op) { + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + fallthrough; + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case BTRFS_MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case BTRFS_MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case BTRFS_MOD_LOG_KEY_ADD: + /* if a move operation is needed it's in the log */ + n--; + break; + case BTRFS_MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case BTRFS_MOD_LOG_ROOT_REPLACE: + /* + * This operation is special. 
For roots, this must be + * handled explicitly before rewinding. + * For non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. In the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. We simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = rb_entry(next, struct tree_mod_elem, node); + if (tm->logical != first_tm->logical) + break; + } + read_unlock(&fs_info->tree_mod_log_lock); + btrfs_set_header_nritems(eb, n); +} + +/* + * Called with eb read locked. If the buffer cannot be rewound, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). + */ +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + } + + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); + btrfs_tree_read_lock(eb_rewin); + tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb_rewin; +} + +/* + * Rewind the state of @root's root node to the given @time_seq value. + * If there are no changes, the current root->root_node is returned. If anything + * changed in between, there's a fresh buffer allocated on which the rewind + * operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). 
+ */ +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; + u64 eb_root_owner = 0; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; + u64 logical; + int level; + + eb_root = btrfs_read_lock_root_node(root); + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (!tm) + return eb_root; + + if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + level = old_root->level; + } else { + logical = eb_root->start; + level = btrfs_header_level(eb_root); + } + + tm = tree_mod_log_search(fs_info, logical, time_seq); + if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + old = read_tree_block(fs_info, logical, root->root_key.objectid, + 0, level, NULL); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); + btrfs_warn(fs_info, + "failed to read tree block %llu from get_old_root", + logical); + } else { + struct tree_mod_elem *tm2; + + btrfs_tree_read_lock(old); + eb = btrfs_clone_extent_buffer(old); + /* + * After the lookup for the most recent tree mod operation + * above and before we locked and cloned the extent buffer + * 'old', a new tree mod log operation may have been added. + * So lookup for a more recent one to make sure the number + * of mod log operations we replay is consistent with the + * number of items we have in the cloned extent buffer, + * otherwise we can hit a BUG_ON when rewinding the extent + * buffer. + */ + tm2 = tree_mod_log_search(fs_info, logical, time_seq); + btrfs_tree_read_unlock(old); + free_extent_buffer(old); + ASSERT(tm2); + ASSERT(tm2 == tm || tm2->seq > tm->seq); + if (!tm2 || tm2->seq < tm->seq) { + free_extent_buffer(eb); + return NULL; + } + tm = tm2; + } + } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(fs_info, logical); + } else { + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + } + + if (!eb) + return NULL; + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, eb_root_owner); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); + if (tm) + tree_mod_log_rewind(fs_info, eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb; +} + +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + struct extent_buffer *eb_root = btrfs_root_node(root); + + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) + level = tm->old_root.level; + else + level = btrfs_header_level(eb_root); + + free_extent_buffer(eb_root); + + return level; +} + +/* + * Return the lowest sequence number in the tree modification log. 
+ * + * Return the sequence number of the oldest tree modification log user, which + * corresponds to the lowest sequence number of all existing users. If there are + * no users it returns 0. + */ +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info) +{ + u64 ret = 0; + + read_lock(&fs_info->tree_mod_log_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct btrfs_seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + ret = elem->seq; + } + read_unlock(&fs_info->tree_mod_log_lock); + + return ret; +} diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h new file mode 100644 index 000000000000..12605d19621b --- /dev/null +++ b/fs/btrfs/tree-mod-log.h @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef BTRFS_TREE_MOD_LOG_H +#define BTRFS_TREE_MOD_LOG_H + +#include "ctree.h" + +/* Represents a tree mod log user. */ +struct btrfs_seq_list { + struct list_head list; + u64 seq; +}; + +#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define BTRFS_SEQ_LAST ((u64)-1) + +enum btrfs_mod_log_op { + BTRFS_MOD_LOG_KEY_REPLACE, + BTRFS_MOD_LOG_KEY_ADD, + BTRFS_MOD_LOG_KEY_REMOVE, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, + BTRFS_MOD_LOG_MOVE_KEYS, + BTRFS_MOD_LOG_ROOT_REPLACE, +}; + +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal); +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op, gfp_t flags); +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq); +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); +int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items); +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items); +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c6810bbaf8b..9a1ead0c4a31 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1224,7 +1224,8 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, return 0; } -static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) +static int devid_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_device *dev1, *dev2; @@ -1458,8 +1459,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, /* Given hole range was invalid (outside of device) */ if (ret == -ERANGE) { *hole_start += *hole_size; - *hole_size = 0; - return 1; + *hole_size = false; + return true; } *hole_start += zone_size; @@ -3098,11 +3099,12 @@ out: return ret; } -static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_block_group *block_group; + u64 
length; int ret; /* @@ -3117,7 +3119,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. */ - lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); + lockdep_assert_held(&fs_info->reclaim_bgs_lock); /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); @@ -3130,8 +3132,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (!block_group) return -ENOENT; btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + length = block_group->length; btrfs_put_block_group(block_group); + /* + * On a zoned file system, discard the whole block group, this will + * trigger a REQ_OP_ZONE_RESET operation on the device zone. If + * resetting the zone fails, don't treat it as a fatal problem from the + * filesystem's point of view. + */ + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); + if (ret) + btrfs_info(fs_info, + "failed to reset zone %llu after relocation", + chunk_offset); + } + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3172,10 +3189,10 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } BUG_ON(ret == 0); /* Corruption */ @@ -3183,7 +3200,7 @@ again: ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto error; if (ret > 0) @@ -3204,7 +3221,7 @@ again: else BUG_ON(ret); } - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (found_key.offset == 0) break; @@ -3744,10 +3761,10 @@ again: goto error; } - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } @@ -3761,7 +3778,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); ret = 0; break; } @@ -3771,7 +3788,7 @@ again: btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); break; } @@ -3788,12 +3805,12 @@ again: btrfs_release_path(path); if (!ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } if (counting) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3818,7 +3835,7 @@ again: count_meta < bctl->meta.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && count_sys < bctl->sys.limit_min)) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } @@ -3832,7 +3849,7 @@ again: ret = btrfs_may_alloc_data_chunk(fs_info, 
found_key.offset);
 			if (ret < 0) {
-				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+				mutex_unlock(&fs_info->reclaim_bgs_lock);
 				goto error;
 			} else if (ret == 1) {
 				chunk_reserved = 1;
 			}
 		}
 
 		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
-		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+		mutex_unlock(&fs_info->reclaim_bgs_lock);
 		if (ret == -ENOSPC) {
 			enospc_errors++;
 		} else if (ret == -ETXTBSY) {
@@ -4725,16 +4742,16 @@ again:
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	do {
-		mutex_lock(&fs_info->delete_unused_bgs_mutex);
+		mutex_lock(&fs_info->reclaim_bgs_lock);
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->reclaim_bgs_lock);
 			goto done;
 		}
 
 		ret = btrfs_previous_item(root, path, 0, key.type);
 		if (ret) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->reclaim_bgs_lock);
 			if (ret < 0)
 				goto done;
 			ret = 0;
@@ -4747,7 +4764,7 @@ again:
 
 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 		if (key.objectid != device->devid) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->reclaim_bgs_lock);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4756,7 +4773,7 @@ again:
 		length = btrfs_dev_extent_length(l, dev_extent);
 
 		if (key.offset + length <= new_size) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->reclaim_bgs_lock);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4772,12 +4789,12 @@ again:
		 */
 		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
 		if (ret < 0) {
-			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->reclaim_bgs_lock);
 			goto done;
 		}
 
 		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
-		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+		mutex_unlock(&fs_info->reclaim_bgs_lock);
 		if (ret == -ENOSPC) {
 			failed++;
 		} else if (ret) {
@@ -4989,6 +5006,8 @@ static void init_alloc_chunk_ctl_policy_zoned(
 		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
 		ctl->devs_max = min_t(int, ctl->devs_max,
 				      BTRFS_MAX_DEVS_SYS_CHUNK);
+	} else {
+		BUG();
 	}
 
 	/* We don't want a chunk larger than 10% of writable space */
@@ -6787,6 +6806,46 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
 	return div_u64(chunk_len, data_stripes);
 }
 
+#if BITS_PER_LONG == 32
+/*
+ * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
+ * can't be accessed on 32bit systems.
+ *
+ * This function does a mount-time check to reject the fs if it already has
+ * a metadata chunk beyond that limit.
+ */
+static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+				  u64 logical, u64 length, u64 type)
+{
+	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+		return 0;
+
+	if (logical + length < MAX_LFS_FILESIZE)
+		return 0;
+
+	btrfs_err_32bit_limit(fs_info);
+	return -EOVERFLOW;
+}
+
+/*
+ * This gives an early warning for any metadata chunk reaching
+ * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
+ * Although we can still access the metadata, it's not going to be possible
+ * once the limit is reached.
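+ *
+ * (Editor's note, an assumption not spelled out in this patch: with 4K
+ * pages the 32bit page cache index limit works out to 2^44 bytes, i.e.
+ * the 16T printed by the "%lluT" warnings added in super.c above.)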
+ */ +static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_METADATA)) + return; + + if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) + return; + + btrfs_warn_32bit_limit(fs_info); +} +#endif + static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { @@ -6797,6 +6856,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 logical; u64 length; u64 devid; + u64 type; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; int ret; @@ -6804,8 +6864,16 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); +#if BITS_PER_LONG == 32 + ret = check_32bit_meta_chunk(fs_info, logical, length, type); + if (ret < 0) + return ret; + warn_32bit_meta_chunk(fs_info, logical, length, type); +#endif + /* * Only need to verify chunk item if we're reading from sys chunk array, * as chunk item in tree block is already verified by tree-checker. @@ -6849,10 +6917,10 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - map->type = btrfs_chunk_type(leaf, chunk); + map->type = type; map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(map->type, em->len, + em->orig_block_len = calc_stripe_length(type, em->len, map->num_stripes); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = @@ -8001,7 +8069,7 @@ static int relocating_repair_kthread(void *data) return -EBUSY; } - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); /* Ensure block group still exists */ cache = btrfs_lookup_block_group(fs_info, target); @@ -8023,7 +8091,7 @@ static int relocating_repair_kthread(void *data) out: if (cache) btrfs_put_block_group(cache); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d4c3e0dd32b8..9c0d84e5ec06 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1f972b75a9ab..70b23a0d03b1 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -21,9 +21,30 @@ /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) +/* + * Location of the first zone of superblock logging zone pairs. 
+ * + * - primary superblock: 0B (zone 0) + * - first copy: 512G (zone starting at that offset) + * - second copy: 4T (zone starting at that offset) + */ +#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) +#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) +#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) + +#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) + /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* + * Maximum supported zone size. Currently, SMR disks have a zone size of + * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not + * expect the zone size to become larger than 8GiB in the near future. + */ +#define BTRFS_MAX_ZONE_SIZE SZ_8G + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, } /* - * The following zones are reserved as the circular buffer on ZONED btrfs. - * - The primary superblock: zones 0 and 1 - * - The first copy: zones 16 and 17 - * - The second copy: zones 1024 or zone at 256GB which is minimum, and - * the following one + * Get the first zone number of the superblock mirror */ static inline u32 sb_zone_number(int shift, int mirror) { - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + u64 zone; + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { - case 0: return 0; - case 1: return 16; - case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024); + case 0: zone = 0; break; + case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; + case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } - return 0; + ASSERT(zone <= U32_MAX); + + return (u32)zone; } /* @@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) zone_sectors = bdev_zone_sectors(bdev); } - nr_sectors = bdev_nr_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + + /* We reject devices with a zone size larger than 8GB */ + if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu larger than supported maximum %llu", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); + ret = -EINVAL; + goto out; + } + + nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->max_zone_append_size = (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; @@ -311,6 +342,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; + if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) { + btrfs_err(fs_info, "zoned: device %pg does not support zone append", + bdev); + ret = -EINVAL; + goto out; + } + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 61e969652fe1..5e41a74a9cb2 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -9,6 +9,12 @@ #include "disk-io.h" #include "block-group.h" +/* + * Block groups with more than this value (percents) of unusable space will be + * scheduled for background reclaim. 
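+ *
+ * (Editor's note: with the default of 75 defined below, a block group is
+ * queued for reclaim once more than 75% of its space is unusable; the
+ * bg_reclaim_threshold sysfs knob added in sysfs.c above accepts values
+ * in the range (50, 100].)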
+ */ +#define BTRFS_DEFAULT_RECLAIM_THRESH 75 + struct btrfs_zoned_device_info { /* * Number of zones, zone size and types of zones if bdev is a diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 570731c4d019..3c03fa37cac4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3358,7 +3358,13 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(grant->mode); + umode_t mode = le32_to_cpu(grant->mode); + + if (inode_wrong_type(inode, mode)) + pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + else + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83d9358854fb..f7a790ed62c4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -677,6 +677,8 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); + if (IS_ERR(inode)) + return PTR_ERR(inode); dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", dentry, dentry, inode); BUG_ON(!d_unhashed(dentry)); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e088843a7734..f22156ee7306 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -248,9 +248,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, ihold(inode); } else { /* mds does not support lookup snapped inode */ - err = -EOPNOTSUPP; - inode = NULL; + inode = ERR_PTR(-EOPNOTSUPP); } + } else { + inode = ERR_PTR(-ESTALE); } ceph_mdsc_put_request(req); @@ -261,8 +262,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d", vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); } - if (!inode) - return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); /* see comments in ceph_get_parent() */ return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 156f849f5385..689e3ffd29d7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -78,9 +78,21 @@ struct inode *ceph_get_snapdir(struct inode *parent) struct inode *inode = ceph_get_inode(parent->i_sb, vino); struct ceph_inode_info *ci = ceph_inode(inode); - BUG_ON(!S_ISDIR(parent->i_mode)); if (IS_ERR(inode)) return inode; + + if (!S_ISDIR(parent->i_mode)) { + pr_warn_once("bad snapdir parent type (mode=0%o)\n", + parent->i_mode); + return ERR_PTR(-ENOTDIR); + } + + if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { + pr_warn_once("bad snapdir inode type (mode=0%o)\n", + inode->i_mode); + return ERR_PTR(-ENOTDIR); + } + inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; @@ -757,11 +769,32 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, bool queue_trunc = false; bool new_version = false; bool fill_inline = false; + umode_t mode = le32_to_cpu(info->mode); + dev_t rdev = le32_to_cpu(info->rdev); dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + /* Once I_NEW is cleared, we can't change type or dev numbers */ + if (inode->i_state & I_NEW) { + inode->i_mode = mode; + } else { + if (inode_wrong_type(inode, mode)) { + pr_warn_once("inode type changed! 
(ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + return -ESTALE; + } + + if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) { + pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", + ceph_vinop(inode), MAJOR(inode->i_rdev), + MINOR(inode->i_rdev), MAJOR(rdev), + MINOR(rdev)); + return -ESTALE; + } + } + info_caps = le32_to_cpu(info->cap.caps); /* prealloc new cap struct */ @@ -815,8 +848,6 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; - /* update inode */ - inode->i_rdev = le32_to_cpu(info->rdev); /* directories have fl_stripe_unit set to zero */ if (le32_to_cpu(info->layout.fl_stripe_unit)) inode->i_blkbits = @@ -828,7 +859,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(info->mode); + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, @@ -926,7 +957,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, case S_IFCHR: case S_IFSOCK: inode->i_blkbits = PAGE_SHIFT; - init_special_inode(inode, inode->i_mode, inode->i_rdev); + init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ceph_file_iops; break; case S_IFREG: diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index fe03cbdae959..bf52e9326ebe 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -18,6 +18,7 @@ config CIFS select CRYPTO_AES select CRYPTO_LIB_DES select KEYS + select DNS_RESOLVER help This is the client VFS module for the SMB3 family of NAS protocols, (including support for the most recent, most secure dialect SMB3.1.1) @@ -112,7 +113,6 @@ config CIFS_WEAK_PW_HASH config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup" depends on CIFS - select DNS_RESOLVER help Enables an upcall mechanism for CIFS which accesses userspace helper utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets @@ -179,7 +179,6 @@ config CIFS_DEBUG_DUMP_KEYS config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS - select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 5213b20843b5..3ee3b7de4ded 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,13 +10,14 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ + dns_resolve.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o -cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o +cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 88a7958170ee..68e8e5b27841 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -17,15 +17,14 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include "fs_context.h" #ifdef 
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 88a7958170ee..68e8e5b27841 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -17,15 +17,14 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include "fs_context.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" #endif -#ifdef CONFIG_CIFS_SWN_UPCALL #include "cifs_swn.h" -#endif void cifs_dump_mem(char *label, void *data, int length) @@ -118,10 +117,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, " POSIX Extensions"); if (tcon->ses->server->ops->dump_share_caps) tcon->ses->server->ops->dump_share_caps(m, tcon); -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) seq_puts(m, " Witness"); -#endif if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); @@ -490,10 +487,8 @@ skip_rdma: spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); - -#ifdef CONFIG_CIFS_SWN_UPCALL cifs_swn_dump(m); -#endif + /* BB add code to dump additional info such as TCP session info now */ return 0; } @@ -702,6 +697,7 @@ static const struct proc_ops cifs_lookup_cache_proc_ops; static const struct proc_ops traceSMB_proc_ops; static const struct proc_ops cifs_security_flags_proc_ops; static const struct proc_ops cifs_linux_ext_proc_ops; +static const struct proc_ops cifs_mount_params_proc_ops; void cifs_proc_init(void) @@ -726,6 +722,8 @@ cifs_proc_init(void) proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, &cifs_lookup_cache_proc_ops); + proc_create("mount_params", 0444, proc_fs_cifs, &cifs_mount_params_proc_ops); + #ifdef CONFIG_CIFS_DFS_UPCALL proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif @@ -764,6 +762,7 @@ cifs_proc_clean(void) remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); + remove_proc_entry("mount_params", proc_fs_cifs); #ifdef CONFIG_CIFS_DFS_UPCALL remove_proc_entry("dfscache", proc_fs_cifs); @@ -1023,6 +1022,51 @@ static const struct proc_ops cifs_security_flags_proc_ops = { .proc_release = single_release, .proc_write = cifs_security_flags_proc_write, }; + +/* To make debugging easier, it can help to show the parsed mount params */ +static int cifs_mount_params_proc_show(struct seq_file *m, void *v) +{ + const struct fs_parameter_spec *p; + const char *type; + + for (p = smb3_fs_parameters; p->name; p++) { + /* cannot use switch with pointers... 
*/ + if (!p->type) { + if (p->flags == fs_param_neg_with_no) + type = "noflag"; + else + type = "flag"; + } else if (p->type == fs_param_is_bool) + type = "bool"; + else if (p->type == fs_param_is_u32) + type = "u32"; + else if (p->type == fs_param_is_u64) + type = "u64"; + else if (p->type == fs_param_is_string) + type = "string"; + else + type = "unknown"; + + seq_printf(m, "%s:%s\n", p->name, type); + } + + return 0; +} + +static int cifs_mount_params_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_mount_params_proc_show, NULL); +} + +static const struct proc_ops cifs_mount_params_proc_ops = { + .proc_open = cifs_mount_params_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + /* No need for write for now */ + /* .proc_write = cifs_mount_params_proc_write, */ +}; + #else inline void cifs_proc_init(void) { } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 6b1ce4efb591..c87c37cf2914 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -270,7 +270,7 @@ static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt, char *mountdata; char *devname; - devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + devname = kstrdup(fullpath, GFP_KERNEL); if (!devname) return ERR_PTR(-ENOMEM); @@ -302,6 +302,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) struct cifs_sb_info *cifs_sb; struct cifs_ses *ses; struct cifs_tcon *tcon; + void *page; char *full_path, *root_path; unsigned int xid; int rc; @@ -324,10 +325,13 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) goto cdda_exit; } + page = alloc_dentry_path(); /* always use tree name prefix */ - full_path = build_path_from_dentry_optional_prefix(mntpt, true); - if (full_path == NULL) - goto cdda_exit; + full_path = build_path_from_dentry_optional_prefix(mntpt, page, true); + if (IS_ERR(full_path)) { + mnt = ERR_CAST(full_path); + goto free_full_path; + } convert_delimiter(full_path, '\\'); @@ -385,7 +389,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) free_root_path: kfree(root_path); free_full_path: - kfree(full_path); + free_dentry_path(page); cdda_exit: cifs_dbg(FYI, "leaving %s\n" , __func__); return mnt; diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index aa77edc12212..2a5325a7ae49 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -81,5 +81,9 @@ struct cifs_sb_info { * (cifs_autodisable_serverino) in order to match new mounts. */ bool mnt_cifs_serverino_autodisabled; + /* + * Available once the mount has completed. + */ + struct dentry *root; }; #endif /* _CIFS_FS_SB_H */
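A recurring micro-cleanup in this series (cifs_dfs_ref.c above; connect.c, dfs_cache.c and fs_context.c below): kstrndup(s, strlen(s), gfp) is just a long-winded kstrdup(s, gfp), since a length bound computed from the source string itself can never truncate anything. A minimal sketch of the equivalence, with a hypothetical variable name:

	char *devname;

	/* before: the bound is derived from the string, so it is never hit */
	devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);

	/* after: same allocation and copy, one call */
	devname = kstrdup(fullpath, GFP_KERNEL);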
diff --git a/fs/cifs/cifs_swn.h b/fs/cifs/cifs_swn.h index 236ecd4959d5..8a9d2a5c9077 100644 --- a/fs/cifs/cifs_swn.h +++ b/fs/cifs/cifs_swn.h @@ -7,11 +7,13 @@ #ifndef _CIFS_SWN_H #define _CIFS_SWN_H +#include "cifsglob.h" struct cifs_tcon; struct sk_buff; struct genl_info; +#ifdef CONFIG_CIFS_SWN_UPCALL extern int cifs_swn_register(struct cifs_tcon *tcon); extern int cifs_swn_unregister(struct cifs_tcon *tcon); @@ -22,4 +24,29 @@ extern void cifs_swn_dump(struct seq_file *m); extern void cifs_swn_check(void); +static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server) +{ + if (server->use_swn_dstaddr) { + server->dstaddr = server->swn_dstaddr; + return true; + } + return false; +} + +static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server) +{ + server->use_swn_dstaddr = false; +} + +#else + +static inline int cifs_swn_register(struct cifs_tcon *tcon) { return 0; } +static inline int cifs_swn_unregister(struct cifs_tcon *tcon) { return 0; } +static inline int cifs_swn_notify(struct sk_buff *s, struct genl_info *i) { return 0; } +static inline void cifs_swn_dump(struct seq_file *m) {} +static inline void cifs_swn_check(void) {} +static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server) { return false; } +static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server) {} + +#endif /* CONFIG_CIFS_SWN_UPCALL */ #endif /* _CIFS_SWN_H */ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index d178cf85e926..784407f9280f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1094,11 +1094,9 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, struct cifs_ace *pnntace = NULL; char *nacl_base = NULL; u32 num_aces = 0; - __u64 nmode; bool new_aces_set = false; /* Assuming that pndacl and pnmode are never NULL */ - nmode = *pnmode; nacl_base = (char *)pndacl; nsize = sizeof(struct cifs_acl); @@ -1651,7 +1649,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, * Add three ACEs for owner, group, everyone getting rid of other ACEs * as chmod disables ACEs and set the security descriptor. Allocate * memory for the smb header, set security descriptor request security - * descriptor parameters, and secuirty descriptor itself + * descriptor parameters, and security descriptor itself */ nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); pnntsd = kmalloc(nsecdesclen, GFP_KERNEL);
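The cifs_swn.h hunk above is the standard kernel idiom for optional features: real declarations when the Kconfig symbol is enabled, static inline no-op stubs when it is not, so call sites (cifs_debug.c above, connect.c below) can drop their #ifdef guards. A condensed sketch of the idiom with hypothetical names, not the literal header:

#ifdef CONFIG_FEATURE_X
int feature_x_register(struct widget *w);
#else
/* stubs compile away entirely; callers need no #ifdef of their own */
static inline int feature_x_register(struct widget *w) { return 0; }
#endif

Runtime checks can likewise use IS_ENABLED(CONFIG_FEATURE_X) rather than preprocessor conditionals, which keeps both branches visible to the compiler; the cifs_get_tcon() change in connect.c below does exactly that with CONFIG_CIFS_SWN_UPCALL.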
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 099ad9f3660b..5f2c139143a7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -217,8 +217,11 @@ cifs_read_super(struct super_block *sb) rc = super_setup_bdi(sb); if (rc) goto out_no_root; - /* tune readahead according to rsize */ - sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE; + /* tune readahead according to rsize if readahead size not set on mount */ + if (cifs_sb->ctx->rasize) + sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE; + else + sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE; sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ @@ -257,6 +260,29 @@ out_no_root: static void cifs_kill_sb(struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + struct cached_fid *cfid; + + /* + * We need to release all dentries for the cached directories + * before we kill the sb. + */ + if (cifs_sb->root) { + dput(cifs_sb->root); + cifs_sb->root = NULL; + } + tcon = cifs_sb_master_tcon(cifs_sb); + if (tcon) { + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); + } + kill_anon_super(sb); cifs_umount(cifs_sb); } @@ -476,7 +502,8 @@ static int cifs_show_devname(struct seq_file *m, struct dentry *root) seq_puts(m, "none"); else { convert_delimiter(devname, '/'); - seq_puts(m, devname); + /* escape all spaces in share names */ + seq_escape(m, devname, " \t"); kfree(devname); } return 0; @@ -625,6 +652,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize); seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize); seq_printf(s, ",bsize=%u", cifs_sb->ctx->bsize); + if (cifs_sb->ctx->rasize) + seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize); if (tcon->ses->server->min_offload) seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); seq_printf(s, ",echo_interval=%lu", @@ -655,10 +684,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",multichannel,max_channels=%zu", tcon->ses->chan_max); -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) seq_puts(s, ",witness"); -#endif return 0; } @@ -833,6 +860,12 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, goto out; } + /* cifs_setup_volume_info->smb3_parse_devname() redups UNC & prepath */ + kfree(cifs_sb->ctx->UNC); + cifs_sb->ctx->UNC = NULL; + kfree(cifs_sb->ctx->prepath); + cifs_sb->ctx->prepath = NULL; + rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC); if (rc) { root = ERR_PTR(rc); @@ -887,6 +920,9 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, if (IS_ERR(root)) goto out_super; + if (cifs_sb) + cifs_sb->root = dget(root); + cifs_dbg(FYI, "dentry root is: %p\n", root); return root; @@ -1527,10 +1563,6 @@ init_cifs(void) int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ - INIT_LIST_HEAD(&GlobalDnotifyReqList); - INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ /* * Initialize Global counters */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0d7ef150dbb2..6beddb108ba0 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -165,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.31" +#define CIFS_VERSION "2.32" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 67c056a9a519..b23a0ee8c6f8 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -495,7 +495,7 @@ struct smb_version_operations { struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, + const char *full_path, umode_t mode, dev_t device_number); /* version specific fiemap implementation */ @@ -988,10 +988,12 @@ struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; bool has_lease:1; + unsigned long time; /* jiffies of when lease was taken */ struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; struct cifs_tcon *tcon; + struct dentry *dentry; struct work_struct lease_break; struct smb2_file_all_info file_all_info; }; @@ -1070,6 +1072,7 @@ struct cifs_tcon { bool use_resilient:1; /* use resilient instead of durable 
handles */ bool use_persistent:1; /* use persistent instead of durable handles */ bool no_lease:1; /* Do not request leases on files or directories */ + bool use_witness:1; /* use witness protocol */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1094,9 +1097,6 @@ struct cifs_tcon { int remap:2; struct list_head ulist; /* cache update list */ #endif -#ifdef CONFIG_CIFS_SWN_UPCALL - bool use_witness:1; /* use witness protocol */ -#endif }; /* @@ -1283,8 +1283,6 @@ struct cifs_aio_ctx { bool direct_io; }; -struct cifs_readdata; - /* asynchronous read support */ struct cifs_readdata { struct kref refcount; @@ -1318,8 +1316,6 @@ struct cifs_readdata { struct page **pages; }; -struct cifs_writedata; - /* asynchronous write support */ struct cifs_writedata { struct kref refcount; @@ -1798,9 +1794,8 @@ require use of the stronger protocol */ * * Semaphores * ---------- - * sesSem operations on smb session - * tconSem operations on tree connection - * fh_sem file handle reconnection operations + * cifsInodeInfo->lock_sem protects: + * the list of locks held by the inode * ****************************************************************************/ @@ -1831,13 +1826,6 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ -/* Outstanding dir notify requests */ -GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; -/* DirNotify response queue */ -GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ - /* * Global transaction id (XID) information */ @@ -1881,19 +1869,9 @@ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ -GLOBAL_EXTERN struct rb_root uidtree; -GLOBAL_EXTERN struct rb_root gidtree; -GLOBAL_EXTERN spinlock_t siduidlock; -GLOBAL_EXTERN spinlock_t sidgidlock; -GLOBAL_EXTERN struct rb_root siduidtree; -GLOBAL_EXTERN struct rb_root sidgidtree; -GLOBAL_EXTERN spinlock_t uidsidlock; -GLOBAL_EXTERN spinlock_t gidsidlock; - void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); -extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *fileinfo_put_wq; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 9adc74bd9f8f..b53a87db282f 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1903,7 +1903,7 @@ typedef struct smb_com_transaction2_fnext_req { __le16 InformationLevel; __u32 ResumeKey; __le16 SearchFlags; - char ResumeFileName[1]; + char ResumeFileName[]; } __attribute__((packed)) TRANSACTION2_FNEXT_REQ; typedef struct smb_com_transaction2_fnext_rsp { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 75ce6f742b8d..a79d50001fbf 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,9 +69,20 @@ extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); extern int init_cifs_spnego(void); extern void exit_cifs_spnego(void); -extern char *build_path_from_dentry(struct dentry *); +extern const char *build_path_from_dentry(struct dentry *, void *); extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry, - bool prefix); + void *page, bool prefix); +static inline void *alloc_dentry_path(void) 
+{ + return __getname(); +} + +static inline void free_dentry_path(void *page) +{ + if (page) + __putname(page); +} + extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, @@ -184,7 +195,7 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock); -extern int cifs_posix_open(char *full_path, struct inode **inode, +extern int cifs_posix_open(const char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, __u16 *netfid, unsigned int xid); @@ -194,7 +205,7 @@ extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb); extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *, struct cifs_sb_info *); -extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); +extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); @@ -207,7 +218,7 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, - unsigned int xid, char *full_path, __u32 dosattr); + unsigned int xid, const char *full_path, __u32 dosattr); extern int cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, const unsigned int xid); @@ -358,11 +369,6 @@ extern int CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, bool delete_file, __u16 fid, __u32 pid_of_opener); -#if 0 -extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, - char *fileName, __u16 dos_attributes, - const struct nls_table *nls_codepage); -#endif /* possibly unneeded function */ extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb, bool set_allocation); @@ -504,12 +510,6 @@ extern int generate_smb311signingkey(struct cifs_ses *); extern int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); #endif /* CIFS_WEAK_PW_HASH */ -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ -extern int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, - const int notify_subdirs, const __u16 netfid, - __u32 filter, struct file *file, int multishot, - const struct nls_table *nls_codepage); -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ extern int CIFSSMBCopy(unsigned int xid, struct cifs_tcon *source_tcon, const char *fromName, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c279527aae92..41f74163cc1c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -114,7 +114,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_shroot_lease_locked(&tcon->crfid); + close_cached_dir_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); @@ -5917,56 +5917,6 @@ SetTimesRetry: return rc; } -/* Can not be used to set time stamps yet (due to old DOS time format) */ -/* Can be used to set attributes */ -#if 0 /* Possibly not needed - since it turns out that strangely NT4 has a bug - handling it anyway and NT4 was what we thought 
it would be needed for - Do not delete it until we prove whether needed for Win9x though */ -int -CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName, - __u16 dos_attrs, const struct nls_table *nls_codepage) -{ - SETATTR_REQ *pSMB = NULL; - SETATTR_RSP *pSMBr = NULL; - int rc = 0; - int bytes_returned; - int name_len; - - cifs_dbg(FYI, "In SetAttrLegacy\n"); - -SetAttrLgcyRetry: - rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - ConvertToUTF16((__le16 *) pSMB->fileName, fileName, - PATH_MAX, nls_codepage); - name_len++; /* trailing null */ - name_len *= 2; - } else { - name_len = copy_path_name(pSMB->fileName, fileName); - } - pSMB->attr = cpu_to_le16(dos_attrs); - pSMB->BufferFormat = 0x04; - inc_rfc1001_len(pSMB, name_len + 1); - pSMB->ByteCount = cpu_to_le16(name_len + 1); - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) - cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc); - - cifs_buf_release(pSMB); - - if (rc == -EAGAIN) - goto SetAttrLgcyRetry; - - return rc; -} -#endif /* temporarily unneeded SetAttr legacy function */ - static void cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, const struct cifs_unix_set_info_args *args) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index eec8a2052da2..121d8b4535b0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -62,9 +62,7 @@ #include "dfs_cache.h" #endif #include "fs_context.h" -#ifdef CONFIG_CIFS_SWN_UPCALL #include "cifs_swn.h" -#endif extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -87,7 +85,6 @@ static void cifs_prune_tlinks(struct work_struct *work); * * This should be called with server->srv_mutex held. */ -#ifdef CONFIG_CIFS_DFS_UPCALL static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) { int rc; @@ -124,6 +121,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) return !rc ? -1 : 0; } +#ifdef CONFIG_CIFS_DFS_UPCALL /* These functions must be called with server->srv_mutex held */ static void reconn_set_next_dfs_target(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb, @@ -314,25 +312,34 @@ cifs_reconnect(struct TCP_Server_Info *server) mutex_lock(&server->srv_mutex); -#ifdef CONFIG_CIFS_SWN_UPCALL - if (server->use_swn_dstaddr) { - server->dstaddr = server->swn_dstaddr; - } else { -#endif + if (!cifs_swn_set_server_dstaddr(server)) { #ifdef CONFIG_CIFS_DFS_UPCALL + if (cifs_sb && cifs_sb->origin_fullpath) /* * Set up next DFS target server (if any) for reconnect. If DFS * feature is disabled, then we will retry last server we * connected to before. */ reconn_set_next_dfs_target(server, cifs_sb, &tgt_list, &tgt_it); + else { #endif + /* + * Resolve the hostname again to make sure that IP address is up-to-date. 
+ */ + rc = reconn_set_ipaddr_from_hostname(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } -#ifdef CONFIG_CIFS_SWN_UPCALL +#ifdef CONFIG_CIFS_DFS_UPCALL } #endif + + } + if (cifs_rdma_enabled(server)) rc = smbd_reconnect(server); else @@ -348,9 +355,7 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_SWN_UPCALL - server->use_swn_dstaddr = false; -#endif + cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); } } while (server->tcpStatus == CifsNeedReconnect); @@ -415,10 +420,8 @@ cifs_echo_request(struct work_struct *work) cifs_dbg(FYI, "Unable to send echo request to server: %s\n", server->hostname); -#ifdef CONFIG_CIFS_SWN_UPCALL /* Check witness registrations */ cifs_swn_check(); -#endif requeue_echo: queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval); @@ -1775,9 +1778,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) * for the request. */ if (is_domain && ses->domainName) { - ctx->domainname = kstrndup(ses->domainName, - strlen(ses->domainName), - GFP_KERNEL); + ctx->domainname = kstrdup(ses->domainName, GFP_KERNEL); if (!ctx->domainname) { cifs_dbg(FYI, "Unable to allocate %zd bytes for domain\n", len); @@ -1994,7 +1995,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) return; } -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) { int rc; @@ -2004,7 +2004,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) __func__, rc); } } -#endif list_del_init(&tcon->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2166,9 +2165,9 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) } tcon->use_resilient = true; } -#ifdef CONFIG_CIFS_SWN_UPCALL + tcon->use_witness = false; - if (ctx->witness) { + if (IS_ENABLED(CONFIG_CIFS_SWN_UPCALL) && ctx->witness) { if (ses->server->vals->protocol_id >= SMB30_PROT_ID) { if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER) { /* @@ -2194,7 +2193,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } } -#endif /* If the user really knows what they are doing they can override */ if (tcon->share_flags & SMB2_SHAREFLAG_NO_CACHING) { @@ -3411,8 +3409,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; } /* Save mount options */ - mntdata = kstrndup(cifs_sb->ctx->mount_options, - strlen(cifs_sb->ctx->mount_options), GFP_KERNEL); + mntdata = kstrdup(cifs_sb->ctx->mount_options, GFP_KERNEL); if (!mntdata) { rc = -ENOMEM; goto error; @@ -3485,7 +3482,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) * links, the prefix path is included in both and may be changed during reconnect. See * cifs_tree_connect(). 
*/ - cifs_sb->origin_fullpath = kstrndup(full_path, strlen(full_path), GFP_KERNEL); + cifs_sb->origin_fullpath = kstrdup(full_path, GFP_KERNEL); if (!cifs_sb->origin_fullpath) { rc = -ENOMEM; goto error; @@ -3862,9 +3859,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->sectype = master_tcon->ses->sectype; ctx->sign = master_tcon->ses->sign; ctx->seal = master_tcon->seal; -#ifdef CONFIG_CIFS_SWN_UPCALL ctx->witness = master_tcon->use_witness; -#endif rc = cifs_set_vol_auth(ctx, master_tcon->ses); if (rc) { diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 098b4bc8da59..b1fa30fefe1f 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -81,23 +81,24 @@ static void refresh_cache_worker(struct work_struct *work); static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); -static int get_normalized_path(const char *path, char **npath) +static int get_normalized_path(const char *path, const char **npath) { if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) return -EINVAL; if (*path == '\\') { - *npath = (char *)path; + *npath = path; } else { - *npath = kstrndup(path, strlen(path), GFP_KERNEL); - if (!*npath) + char *s = kstrdup(path, GFP_KERNEL); + if (!s) return -ENOMEM; - convert_delimiter(*npath, '\\'); + convert_delimiter(s, '\\'); + *npath = s; } return 0; } -static inline void free_normalized_path(const char *path, char *npath) +static inline void free_normalized_path(const char *path, const char *npath) { if (path != npath) kfree(npath); @@ -358,7 +359,7 @@ static struct cache_dfs_tgt *alloc_target(const char *name, int path_consumed) t = kmalloc(sizeof(*t), GFP_ATOMIC); if (!t) return ERR_PTR(-ENOMEM); - t->name = kstrndup(name, strlen(name), GFP_ATOMIC); + t->name = kstrdup(name, GFP_ATOMIC); if (!t->name) { kfree(t); return ERR_PTR(-ENOMEM); @@ -419,7 +420,7 @@ static struct cache_entry *alloc_cache_entry(const char *path, if (!ce) return ERR_PTR(-ENOMEM); - ce->path = kstrndup(path, strlen(path), GFP_KERNEL); + ce->path = kstrdup(path, GFP_KERNEL); if (!ce->path) { kmem_cache_free(cache_slab, ce); return ERR_PTR(-ENOMEM); @@ -531,7 +532,7 @@ static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *ha char *s, *e; char sep; - npath = kstrndup(path, strlen(path), GFP_KERNEL); + npath = kstrdup(path, GFP_KERNEL); if (!npath) return ERR_PTR(-ENOMEM); @@ -641,7 +642,7 @@ static int __update_cache_entry(const char *path, if (ce->tgthint) { s = ce->tgthint->name; - th = kstrndup(s, strlen(s), GFP_ATOMIC); + th = kstrdup(s, GFP_ATOMIC); if (!th) return -ENOMEM; } @@ -786,11 +787,11 @@ static int setup_referral(const char *path, struct cache_entry *ce, memset(ref, 0, sizeof(*ref)); - ref->path_name = kstrndup(path, strlen(path), GFP_ATOMIC); + ref->path_name = kstrdup(path, GFP_ATOMIC); if (!ref->path_name) return -ENOMEM; - ref->node_name = kstrndup(target, strlen(target), GFP_ATOMIC); + ref->node_name = kstrdup(target, GFP_ATOMIC); if (!ref->node_name) { rc = -ENOMEM; goto err_free_path; @@ -828,7 +829,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl) goto err_free_it; } - it->it_name = kstrndup(t->name, strlen(t->name), GFP_ATOMIC); + it->it_name = kstrdup(t->name, GFP_ATOMIC); if (!it->it_name) { kfree(it); rc = -ENOMEM; @@ -882,7 +883,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, struct dfs_cache_tgt_list *tgt_list) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; rc = get_normalized_path(path, &npath); @@ -936,7 +937,7 @@ int 
dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; rc = get_normalized_path(path, &npath); @@ -991,7 +992,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, const struct dfs_cache_tgt_iterator *it) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; @@ -1053,7 +1054,7 @@ int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; @@ -1111,7 +1112,7 @@ int dfs_cache_get_tgt_referral(const char *path, struct dfs_info3_param *ref) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; if (!it || !ref) @@ -1166,7 +1167,7 @@ int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, const char *fu if (!vi) return -ENOMEM; - vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + vi->fullpath = kstrdup(fullpath, GFP_KERNEL); if (!vi->fullpath) { rc = -ENOMEM; goto err_free_vi; @@ -1484,7 +1485,7 @@ static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon) { int rc = 0; unsigned int xid; - char *path, *npath; + const char *path, *npath; struct cache_entry *ce; struct cifs_ses *root_ses = NULL, *ses; struct dfs_info3_param *refs = NULL;
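In the dir.c rewrite below, the hand-rolled, retry-on-rename reverse walk is replaced by dentry_path_raw(), which writes the path into the tail of the caller's page and returns a pointer into that buffer; the DFS tree name and mount prepath are then prepended by stepping the returned pointer backwards. A rough picture of the buffer after all three steps (illustration only, not from the patch):

/*
 *   page                                     page + PAGE_SIZE
 *   |---- unused ----|<treeName><prepath><dentry path>|NUL|
 *                    ^
 *                    returned pointer s: dentry_path_raw() produced only
 *                    the rightmost segment; s was then stepped back with
 *                    s -= pplen and s -= dfsplen before each memcpy().
 */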
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index a3fb81e0ba17..c85aff838305 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -78,31 +78,31 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s } /* Note: caller must free return buffer */ -char * -build_path_from_dentry(struct dentry *direntry) +const char * +build_path_from_dentry(struct dentry *direntry, void *page) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); bool prefix = tcon->Flags & SMB_SHARE_IS_IN_DFS; - return build_path_from_dentry_optional_prefix(direntry, + return build_path_from_dentry_optional_prefix(direntry, page, prefix); } char * -build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix) +build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, + bool prefix) { - struct dentry *temp; - int namelen; int dfsplen; int pplen = 0; - char *full_path; - char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - unsigned seq; + char dirsep = CIFS_DIR_SEP(cifs_sb); + char *s; + + if (unlikely(!page)) + return ERR_PTR(-ENOMEM); - dirsep = CIFS_DIR_SEP(cifs_sb); if (prefix) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else @@ -111,86 +111,39 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; -cifs_bp_rename_retry: - namelen = dfsplen + pplen; - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - for (temp = direntry; !IS_ROOT(temp);) { - namelen += (1 + temp->d_name.len); - temp = temp->d_parent; - if (temp == NULL) { - cifs_dbg(VFS, "corrupt dentry\n"); - rcu_read_unlock(); - return NULL; - } - } - rcu_read_unlock(); - - full_path = kmalloc(namelen+1, GFP_ATOMIC); - if (full_path == NULL) - return full_path; - full_path[namelen] = 0; /* trailing null */ - rcu_read_lock(); - for (temp = direntry; !IS_ROOT(temp);) { - spin_lock(&temp->d_lock); - namelen -= 1 + temp->d_name.len; - if (namelen < 0) { - spin_unlock(&temp->d_lock); - break; - } else { - full_path[namelen] = dirsep; - strncpy(full_path + namelen + 1, temp->d_name.name, - temp->d_name.len); - cifs_dbg(FYI, "name: %s\n", full_path + namelen); - } - spin_unlock(&temp->d_lock); - temp = temp->d_parent; - if (temp == NULL) { - cifs_dbg(VFS, "corrupt dentry\n"); - rcu_read_unlock(); - kfree(full_path); - return NULL; - } - } - rcu_read_unlock(); - if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) { - cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", - namelen, dfsplen); - /* presumably this is only possible if racing with a rename - of one of the parent directories (we can not lock the dentries - above us to prevent this, but retrying should be harmless) */ - kfree(full_path); - goto cifs_bp_rename_retry; - } - /* DIR_SEP already set for byte 0 / vs \ but not for - subsequent slashes in prepath which currently must - be entered the right way - not sure if there is an alternative - since the '\' is a valid posix character so we can not switch - those safely to '/' if any are found in the middle of the prepath */ - /* BB test paths to Windows with '/' in the midst of prepath */ - + s = dentry_path_raw(direntry, page, PAGE_SIZE); + if (IS_ERR(s)) + return s; + if (!s[1]) // for root we want "", not "/" + s++; + if (s < (char *)page + pplen + dfsplen) + return ERR_PTR(-ENAMETOOLONG); if (pplen) { - int i; - cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath); - memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1); - full_path[dfsplen] = dirsep; - for (i = 0; i < pplen-1; i++) - if (full_path[dfsplen+1+i] == '/') - full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb); + s -= pplen; + memcpy(s + 1, cifs_sb->prepath, pplen - 1); + *s = '/'; } + if (dirsep != '/') { + /* BB test paths to Windows with '/' in the midst of prepath */ + char *p; + for (p = s; *p; p++) + if (*p == '/') + *p = dirsep; + } if (dfsplen) { - strncpy(full_path, tcon->treeName, dfsplen); + s -= dfsplen; + memcpy(s, tcon->treeName, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { - if (full_path[i] == '\\') - full_path[i] = '/'; + if (s[i] == '\\') + s[i] = '/'; } } } - return full_path; + return s; } /* @@ -233,7 +186,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, int desired_access; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = tlink_tcon(tlink); - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; @@ -244,9 +198,11 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, if (tcon->ses->server->oplocks) *oplock = REQ_OPLOCK; - full_path = build_path_from_dentry(direntry); - if (!full_path) - return -ENOMEM; +
full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + free_dentry_path(page); + return PTR_ERR(full_path); + } if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -418,15 +374,16 @@ cifs_create_get_file_info: if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) - newinode->i_mode = mode; - if ((*oplock & CIFS_CREATE_ACTION) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { - newinode->i_uid = current_fsuid(); - if (inode->i_mode & S_ISGID) - newinode->i_gid = inode->i_gid; - else - newinode->i_gid = current_fsgid(); + if ((*oplock & CIFS_CREATE_ACTION) && S_ISREG(newinode->i_mode)) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); + } } } } @@ -448,7 +405,7 @@ cifs_create_set_dentry: out: kfree(buf); - kfree(full_path); + free_dentry_path(page); return rc; out_err: @@ -619,7 +576,8 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - char *full_path = NULL; + const char *full_path; + void *page; if (!old_valid_dev(device_number)) return -EINVAL; @@ -629,13 +587,13 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, if (IS_ERR(tlink)) return PTR_ERR(tlink); + page = alloc_dentry_path(); tcon = tlink_tcon(tlink); - xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto mknod_out; } @@ -644,7 +602,7 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, device_number); mknod_out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -660,7 +618,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct tcon_link *tlink; struct cifs_tcon *pTcon; struct inode *newInode = NULL; - char *full_path = NULL; + const char *full_path; + void *page; xid = get_xid(); @@ -687,11 +646,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { cifs_put_tlink(tlink); free_xid(xid); - return ERR_PTR(-ENOMEM); + free_dentry_path(page); + return ERR_CAST(full_path); } if (d_really_is_positive(direntry)) { @@ -727,7 +688,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } newInode = ERR_PTR(rc); } - kfree(full_path); + free_dentry_path(page); cifs_put_tlink(tlink); free_xid(xid); return d_splice_alias(newInode, direntry); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 042e24aad410..639c59596d4f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -112,7 +112,7 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OPEN; } -int cifs_posix_open(char *full_path, struct inode **pinode, +int cifs_posix_open(const char *full_path, struct inode **pinode, 
struct super_block *sb, int mode, unsigned int f_flags, __u32 *poplock, __u16 *pnetfid, unsigned int xid) { @@ -166,7 +166,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, } } else { cifs_revalidate_mapping(*pinode); - cifs_fattr_to_inode(*pinode, &fattr); + rc = cifs_fattr_to_inode(*pinode, &fattr); } posix_open_ret: @@ -175,7 +175,7 @@ posix_open_ret: } static int -cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, +cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, struct cifs_fid *fid, unsigned int xid) { @@ -530,7 +530,8 @@ int cifs_open(struct inode *inode, struct file *file) struct cifs_tcon *tcon; struct tcon_link *tlink; struct cifsFileInfo *cfile = NULL; - char *full_path = NULL; + void *page; + const char *full_path; bool posix_open_ok = false; struct cifs_fid fid; struct cifs_pending_open open; @@ -546,9 +547,10 @@ int cifs_open(struct inode *inode, struct file *file) tcon = tlink_tcon(tlink); server = tcon->ses->server; - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(file_dentry(file), page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -640,7 +642,7 @@ int cifs_open(struct inode *inode, struct file *file) } out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -689,7 +691,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) struct TCP_Server_Info *server; struct cifsInodeInfo *cinode; struct inode *inode; - char *full_path = NULL; + void *page; + const char *full_path; int desired_access; int disposition = FILE_OPEN; int create_options = CREATE_NOT_DIR; @@ -699,9 +702,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) mutex_lock(&cfile->fh_mutex); if (!cfile->invalidHandle) { mutex_unlock(&cfile->fh_mutex); - rc = 0; free_xid(xid); - return rc; + return 0; } inode = d_inode(cfile->dentry); @@ -715,12 +717,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) * called and if the server was down that means we end up here, and we * can never tell if the caller already has the rename_sem. 
*/ - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { mutex_unlock(&cfile->fh_mutex); + free_dentry_path(page); free_xid(xid); - return rc; + return PTR_ERR(full_path); } cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n", @@ -838,7 +841,7 @@ reopen_success: cifs_relock_file(cfile); reopen_error_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2069,34 +2072,31 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, int flags, struct cifsFileInfo **ret_file) { - struct list_head *tmp; struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; - char *full_path; + void *page = alloc_dentry_path(); *ret_file = NULL; spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, - tlist); - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { + list_for_each_entry(cfile, &tcon->openFileList, tlist) { + struct cifsInodeInfo *cinode; + const char *full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { spin_unlock(&tcon->open_file_lock); - return -ENOMEM; + free_dentry_path(page); + return PTR_ERR(full_path); } - if (strcmp(full_path, name)) { - kfree(full_path); + if (strcmp(full_path, name)) continue; - } - kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return cifs_get_writable_file(cinode, flags, ret_file); } spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return -ENOENT; } @@ -2104,35 +2104,32 @@ int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file) { - struct list_head *tmp; struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; - char *full_path; + void *page = alloc_dentry_path(); *ret_file = NULL; spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, - tlist); - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { + list_for_each_entry(cfile, &tcon->openFileList, tlist) { + struct cifsInodeInfo *cinode; + const char *full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { spin_unlock(&tcon->open_file_lock); - return -ENOMEM; + free_dentry_path(page); + return PTR_ERR(full_path); } - if (strcmp(full_path, name)) { - kfree(full_path); + if (strcmp(full_path, name)) continue; - } - kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); return *ret_file ? 
0 : -ENOENT; } spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return -ENOENT; } diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 78889024a7ed..3e0d016849e3 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -137,6 +137,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("min_enc_offload", Opt_min_enc_offload), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), + fsparam_u32("rasize", Opt_rasize), fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), @@ -188,8 +189,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { {} }; -int -cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) +static int +cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; @@ -203,7 +204,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: - cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + cifs_errorf(fc, "sec=krb5p is not supported!\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; @@ -238,7 +239,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) ctx->nullauth = 1; break; default: - cifs_dbg(VFS, "bad security option: %s\n", value); + cifs_errorf(fc, "bad security option: %s\n", value); return 1; } @@ -254,8 +255,8 @@ static const match_table_t cifs_cacheflavor_tokens = { { Opt_cache_err, NULL } }; -int -cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx) +static int +cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; @@ -291,7 +292,7 @@ cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx) ctx->cache_rw = true; break; default: - cifs_dbg(VFS, "bad cache= option: %s\n", value); + cifs_errorf(fc, "bad cache= option: %s\n", value); return 1; } return 0; @@ -339,7 +340,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx } static int -cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) +cifs_parse_smb_version(struct fs_context *fc, char *value, struct smb3_fs_context *ctx, bool is_smb3) { substring_t args[MAX_OPT_ARGS]; @@ -347,24 +348,24 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY case Smb_1: if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { - cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); + cifs_errorf(fc, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); return 1; } - cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); + cifs_errorf(fc, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); ctx->ops = &smb1_operations; ctx->vals = &smb1_values; break; case Smb_20: if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { - cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); + cifs_errorf(fc, "vers=2.0 not permitted when mounting with smb3\n"); return 1; } ctx->ops = &smb20_operations; @@ -372,10 +373,10 @@ 
cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) break; #else case Smb_1: - cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); + cifs_errorf(fc, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); return 1; case Smb_20: - cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); + cifs_errorf(fc, "vers=2.0 mount not permitted when legacy dialects disabled\n"); return 1; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ case Smb_21: @@ -403,7 +404,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) ctx->vals = &smbdefault_values; break; default: - cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); + cifs_errorf(fc, "Unknown vers= option specified: %s\n", value); return 1; } return 0; @@ -430,7 +431,7 @@ int smb3_parse_opt(const char *options, const char *key, char **val) if (nval == p) continue; *nval++ = 0; - *val = kstrndup(nval, strlen(nval), GFP_KERNEL); + *val = kstrdup(nval, GFP_KERNEL); rc = !*val ? -ENOMEM : 0; goto out; } @@ -588,14 +589,14 @@ static int smb3_fs_context_validate(struct fs_context *fc) struct smb3_fs_context *ctx = smb3_fc2context(fc); if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) { - cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n"); + cifs_errorf(fc, "SMB Direct requires Version >=3.0\n"); return -EOPNOTSUPP; } #ifndef CONFIG_KEYS /* Multiuser mounts require CONFIG_KEYS support */ if (ctx->multiuser) { - cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); + cifs_errorf(fc, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); return -1; } #endif @@ -605,13 +606,13 @@ static int smb3_fs_context_validate(struct fs_context *fc) if (!ctx->UNC) { - cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); + cifs_errorf(fc, "CIFS mount error: No usable UNC path provided in device string!\n"); return -1; } /* make sure UNC has a share name */ if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) { - cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n"); + cifs_errorf(fc, "Malformed UNC. Unable to find share name.\n"); return -ENOENT; } @@ -684,49 +685,50 @@ static void smb3_fs_context_free(struct fs_context *fc) * Compare the old and new proposed context during reconfigure * and check if the changes are compatible. 
*/ -static int smb3_verify_reconfigure_ctx(struct smb3_fs_context *new_ctx, +static int smb3_verify_reconfigure_ctx(struct fs_context *fc, + struct smb3_fs_context *new_ctx, struct smb3_fs_context *old_ctx) { if (new_ctx->posix_paths != old_ctx->posix_paths) { - cifs_dbg(VFS, "can not change posixpaths during remount\n"); + cifs_errorf(fc, "can not change posixpaths during remount\n"); return -EINVAL; } if (new_ctx->sectype != old_ctx->sectype) { - cifs_dbg(VFS, "can not change sec during remount\n"); + cifs_errorf(fc, "can not change sec during remount\n"); return -EINVAL; } if (new_ctx->multiuser != old_ctx->multiuser) { - cifs_dbg(VFS, "can not change multiuser during remount\n"); + cifs_errorf(fc, "can not change multiuser during remount\n"); return -EINVAL; } if (new_ctx->UNC && (!old_ctx->UNC || strcmp(new_ctx->UNC, old_ctx->UNC))) { - cifs_dbg(VFS, "can not change UNC during remount\n"); + cifs_errorf(fc, "can not change UNC during remount\n"); return -EINVAL; } if (new_ctx->username && (!old_ctx->username || strcmp(new_ctx->username, old_ctx->username))) { - cifs_dbg(VFS, "can not change username during remount\n"); + cifs_errorf(fc, "can not change username during remount\n"); return -EINVAL; } if (new_ctx->password && (!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) { - cifs_dbg(VFS, "can not change password during remount\n"); + cifs_errorf(fc, "can not change password during remount\n"); return -EINVAL; } if (new_ctx->domainname && (!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) { - cifs_dbg(VFS, "can not change domainname during remount\n"); + cifs_errorf(fc, "can not change domainname during remount\n"); return -EINVAL; } if (new_ctx->nodename && (!old_ctx->nodename || strcmp(new_ctx->nodename, old_ctx->nodename))) { - cifs_dbg(VFS, "can not change nodename during remount\n"); + cifs_errorf(fc, "can not change nodename during remount\n"); return -EINVAL; } if (new_ctx->iocharset && (!old_ctx->iocharset || strcmp(new_ctx->iocharset, old_ctx->iocharset))) { - cifs_dbg(VFS, "can not change iocharset during remount\n"); + cifs_errorf(fc, "can not change iocharset during remount\n"); return -EINVAL; } @@ -747,7 +749,7 @@ static int smb3_reconfigure(struct fs_context *fc) struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); int rc; - rc = smb3_verify_reconfigure_ctx(ctx, cifs_sb->ctx); + rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx); if (rc) return rc; @@ -933,13 +935,33 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, */ if ((result.uint_32 < CIFS_MAX_MSGSIZE) || (result.uint_32 > (4 * SMB3_DEFAULT_IOSIZE))) { - cifs_dbg(VFS, "%s: Invalid blocksize\n", + cifs_errorf(fc, "%s: Invalid blocksize\n", __func__); goto cifs_parse_mount_err; } ctx->bsize = result.uint_32; ctx->got_bsize = true; break; + case Opt_rasize: + /* + * readahead size realistically should never need to be + * less than 1M (CIFS_DEFAULT_IOSIZE) or greater than 32M + * (perhaps an exception should be considered + * in the case of a large number of channels + * when multichannel is negotiated) since that would lead + * to plenty of parallel I/O in flight to the server. + * Note that smaller read ahead sizes would + * hurt performance of common tools like cp and scp + * which often trigger sequential i/o with read ahead + */ + if ((result.uint_32 > (8 * SMB3_DEFAULT_IOSIZE)) || + (result.uint_32 < CIFS_DEFAULT_IOSIZE)) { + cifs_errorf(fc, "%s: Invalid rasize %d vs. %d\n", + __func__, result.uint_32, SMB3_DEFAULT_IOSIZE); + goto cifs_parse_mount_err; + } + ctx->rasize = result.uint_32; + break; case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; break;
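To make the bounds just above concrete: assuming CIFS_DEFAULT_IOSIZE is 1MB and SMB3_DEFAULT_IOSIZE is 4MB (their usual values in cifsglob.h; neither definition appears in this section), the parser accepts readahead sizes between 1MB and 32MB:

/*
 * rasize=4194304  (4MB)  - accepted: 1MB <= 4MB <= 32MB
 * rasize=524288   (512K) - rejected: below CIFS_DEFAULT_IOSIZE (1MB)
 * rasize=67108864 (64MB) - rejected: above 8 * SMB3_DEFAULT_IOSIZE (32MB)
 * rasize=0 (default)     - readahead falls back to the negotiated rsize,
 *                          as in the cifs_read_super() hunk earlier
 */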
@@ -951,25 +973,25 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_acregmax: ctx->acregmax = HZ * result.uint_32; if (ctx->acregmax > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "acregmax too large\n"); + cifs_errorf(fc, "acregmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_acdirmax: ctx->acdirmax = HZ * result.uint_32; if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "acdirmax too large\n"); + cifs_errorf(fc, "acdirmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_actimeo: if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "timeout too large\n"); + cifs_errorf(fc, "timeout too large\n"); goto cifs_parse_mount_err; } if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || (ctx->acregmax != CIFS_DEF_ACTIMEO)) { - cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n"); + cifs_errorf(fc, "actimeo ignored since acregmax or acdirmax specified\n"); break; } ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; @@ -982,7 +1004,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { - cifs_dbg(VFS, "%s: Invalid max_credits value\n", + cifs_errorf(fc, "%s: Invalid max_credits value\n", __func__); goto cifs_parse_mount_err; } @@ -990,7 +1012,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_max_channels: if (result.uint_32 < 1 || result.uint_32 > CIFS_MAX_CHANNELS) { - cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n", + cifs_errorf(fc, "%s: Invalid max_channels value, needs to be 1-%d\n", __func__, CIFS_MAX_CHANNELS); goto cifs_parse_mount_err; } @@ -999,7 +1021,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_handletimeout: ctx->handle_timeout = result.uint_32; if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { - cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n"); + cifs_errorf(fc, "Invalid handle cache timeout, longer than 16 minutes\n"); goto cifs_parse_mount_err; } break; @@ -1010,23 +1032,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case 0: break; case -ENOMEM: - cifs_dbg(VFS, "Unable to allocate memory for devname\n"); + cifs_errorf(fc, "Unable to allocate memory for devname\n"); goto cifs_parse_mount_err; case -EINVAL: - cifs_dbg(VFS, "Malformed UNC in devname\n"); + cifs_errorf(fc, "Malformed UNC in devname\n"); goto cifs_parse_mount_err; default: - cifs_dbg(VFS, "Unknown error parsing devname\n"); + cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } ctx->source = kstrdup(param->string, GFP_KERNEL); if (ctx->source == NULL) { - cifs_dbg(VFS, "OOM when copying UNC string\n"); + cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } fc->source = kstrdup(param->string, GFP_KERNEL); if (fc->source == NULL) { - cifs_dbg(VFS, "OOM when copying UNC string\n"); + cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } break; @@ -1046,7 +1068,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } ctx->username = kstrdup(param->string, GFP_KERNEL); if (ctx->username == NULL) { - cifs_dbg(VFS, "OOM when copying username string\n"); + cifs_errorf(fc, "OOM when copying username string\n"); goto cifs_parse_mount_err; } break; @@ -1058,7 
+1080,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->password = kstrdup(param->string, GFP_KERNEL); if (ctx->password == NULL) { - cifs_dbg(VFS, "OOM when copying password string\n"); + cifs_errorf(fc, "OOM when copying password string\n"); goto cifs_parse_mount_err; } break; @@ -1085,7 +1107,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, kfree(ctx->domainname); ctx->domainname = kstrdup(param->string, GFP_KERNEL); if (ctx->domainname == NULL) { - cifs_dbg(VFS, "OOM when copying domainname string\n"); + cifs_errorf(fc, "OOM when copying domainname string\n"); goto cifs_parse_mount_err; } cifs_dbg(FYI, "Domain name set\n"); @@ -1109,7 +1131,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, kfree(ctx->iocharset); ctx->iocharset = kstrdup(param->string, GFP_KERNEL); if (ctx->iocharset == NULL) { - cifs_dbg(VFS, "OOM when copying iocharset string\n"); + cifs_errorf(fc, "OOM when copying iocharset string\n"); goto cifs_parse_mount_err; } } @@ -1175,21 +1197,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; case Opt_vers: /* protocol version (dialect) */ - if (cifs_parse_smb_version(param->string, ctx, is_smb3) != 0) + if (cifs_parse_smb_version(fc, param->string, ctx, is_smb3) != 0) goto cifs_parse_mount_err; ctx->got_version = true; break; case Opt_sec: - if (cifs_parse_security_flavors(param->string, ctx) != 0) + if (cifs_parse_security_flavors(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_cache: - if (cifs_parse_cache_flavor(param->string, ctx) != 0) + if (cifs_parse_cache_flavor(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_witness: #ifndef CONFIG_CIFS_SWN_UPCALL - cifs_dbg(VFS, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); + cifs_errorf(fc, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); goto cifs_parse_mount_err; #endif ctx->witness = true; @@ -1290,7 +1312,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE - cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); + cifs_errorf(fc, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); goto cifs_parse_mount_err; #endif ctx->fsc = true; @@ -1311,15 +1333,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (result.negated) { ctx->nopersistent = true; if (ctx->persistent) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } else { ctx->persistent = true; if ((ctx->nopersistent) || (ctx->resilient)) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } @@ -1330,8 +1350,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } else { ctx->resilient = true; if (ctx->persistent) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } @@ -1379,7 +1398,9 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->cred_uid = current_uid(); ctx->linux_uid = current_uid(); ctx->linux_gid = current_gid(); - ctx->bsize = 1024 * 1024; /* can improve cp performance significantly */ + /* By default 4MB read ahead size, 1MB block size */ + ctx->bsize = CIFS_DEFAULT_IOSIZE; /* can improve cp performance 
significantly */ + ctx->rasize = 0; /* 0 = use default (ie negotiated rsize) for read ahead pages */ /* * default to SFM style remapping of seven reserved characters diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 87dd1f7168f2..2a71c8e411ac 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -13,7 +13,12 @@ #include <linux/parser.h> #include <linux/fs_parser.h> -#define cifs_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) +/* Log errors in fs_context (new mount api) but also in dmesg (old style) */ +#define cifs_errorf(fc, fmt, ...) \ + do { \ + errorf(fc, fmt, ## __VA_ARGS__); \ + cifs_dbg(VFS, fmt, ## __VA_ARGS__); \ + } while (0) enum smb_version { Smb_1 = 1, @@ -115,6 +120,7 @@ enum cifs_param { Opt_dirmode, Opt_min_enc_offload, Opt_blocksize, + Opt_rasize, Opt_rsize, Opt_wsize, Opt_actimeo, @@ -230,6 +236,7 @@ struct smb3_fs_context { /* reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; unsigned int bsize; + unsigned int rasize; unsigned int rsize; unsigned int wsize; unsigned int min_offload; @@ -257,10 +264,6 @@ struct smb3_fs_context { extern const struct fs_parameter_spec smb3_fs_parameters[]; -extern int cifs_parse_cache_flavor(char *value, - struct smb3_fs_context *ctx); -extern int cifs_parse_security_flavors(char *value, - struct smb3_fs_context *ctx); extern int smb3_init_fs_context(struct fs_context *fc); extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx); extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f2df4422e54a..002d864b8f7b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -157,12 +157,18 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } /* populate an inode with info from a cifs_fattr struct */ -void +int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + if (!(inode->i_state & I_NEW) && + unlikely(inode_wrong_type(inode, fattr->cf_mode))) { + CIFS_I(inode)->time = 0; /* force reval */ + return -ESTALE; + } + cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); @@ -219,6 +225,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_flags |= S_AUTOMOUNT; if (inode->i_state & I_NEW) cifs_set_ops(inode); + return 0; } void @@ -363,7 +370,7 @@ cifs_get_file_info_unix(struct file *filp) rc = 0; } - cifs_fattr_to_inode(inode, &fattr); + rc = cifs_fattr_to_inode(inode, &fattr); free_xid(xid); return rc; } @@ -426,14 +433,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } /* if filetype is different, return error */ - if (unlikely(((*pinode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*pinode)->time = 0; /* force reval */ - rc = -ESTALE; - goto cgiiu_exit; - } - - cifs_fattr_to_inode(*pinode, &fattr); + rc = cifs_fattr_to_inode(*pinode, &fattr); } cgiiu_exit: @@ -783,7 +783,8 @@ cifs_get_file_info(struct file *filp) */ fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; - cifs_fattr_to_inode(inode, &fattr); + /* if filetype is different, return error */ + rc = cifs_fattr_to_inode(inode, &fattr); cgfi_exit: free_xid(xid); return rc; @@ -1100,16 +1101,8 @@ handle_mnt_opt: rc = -ESTALE; goto out; } - /* if filetype is different, return error */ - if (unlikely(((*inode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - 
goto out; - } - - cifs_fattr_to_inode(*inode, &fattr); + rc = cifs_fattr_to_inode(*inode, &fattr); } out: cifs_buf_release(smb1_backup_rsp_buf); @@ -1215,14 +1208,7 @@ smb311_posix_get_inode_info(struct inode **inode, } /* if filetype is different, return error */ - if (unlikely(((*inode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } - - cifs_fattr_to_inode(*inode, &fattr); + rc = cifs_fattr_to_inode(*inode, &fattr); } out: cifs_put_tlink(tlink); @@ -1249,7 +1235,7 @@ cifs_find_inode(struct inode *inode, void *opaque) return 0; /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + if (inode_wrong_type(inode, fattr->cf_mode)) return 0; /* if it's not a directory or has no dentries, then flag it */ @@ -1317,6 +1303,7 @@ retry_iget5_locked: } } + /* can't fail - see cifs_find_inode() */ cifs_fattr_to_inode(inode, fattr); if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; @@ -1408,7 +1395,7 @@ out: int cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, - char *full_path, __u32 dosattr) + const char *full_path, __u32 dosattr) { bool set_time = false; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1609,7 +1596,8 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) { int rc = 0; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page; struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_inode; struct super_block *sb = dir->i_sb; @@ -1629,6 +1617,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) server = tcon->ses->server; xid = get_xid(); + page = alloc_dentry_path(); if (tcon->nodelete) { rc = -EACCES; @@ -1637,9 +1626,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) /* Unlink can be called from rename so we can not take the * sb->s_vfs_rename_mutex here */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto unlink_out; } @@ -1713,7 +1702,7 @@ out_reval: cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: - kfree(full_path); + free_dentry_path(page); kfree(attrs); free_xid(xid); cifs_put_tlink(tlink); @@ -1740,6 +1729,16 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, if (rc) return rc; + if (!S_ISDIR(inode->i_mode)) { + /* + * mkdir succeeded, but another client has managed to remove the + * sucker and replace it with non-directory. Return success, + * but don't leave the child in dcache. + */ + iput(inode); + d_drop(dentry); + return 0; + } /* * setting nlink not necessary except in cases where we failed to get it * from the server or was set bogus. 
Also, since this is a brand new @@ -1791,7 +1790,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, } } d_instantiate(dentry, inode); - return rc; + return 0; } static int @@ -1866,7 +1865,8 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - char *full_path; + const char *full_path; + void *page; cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n", mode, inode); @@ -1879,9 +1879,10 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto mkdir_out; } @@ -1924,7 +1925,7 @@ mkdir_out: * attributes are invalid now. */ CIFS_I(inode)->time = 0; - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -1938,16 +1939,17 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct cifsInodeInfo *cifsInode; cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode); xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto rmdir_exit; } @@ -1997,7 +1999,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) current_time(inode); rmdir_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2072,8 +2074,8 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, struct dentry *source_dentry, struct inode *target_dir, struct dentry *target_dentry, unsigned int flags) { - char *from_name = NULL; - char *to_name = NULL; + const char *from_name, *to_name; + void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; @@ -2091,21 +2093,19 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + page1 = alloc_dentry_path(); + page2 = alloc_dentry_path(); xid = get_xid(); - /* - * we already have the rename sem so we do not need to - * grab it again here to protect the path integrity - */ - from_name = build_path_from_dentry(source_dentry); - if (from_name == NULL) { - rc = -ENOMEM; + from_name = build_path_from_dentry(source_dentry, page1); + if (IS_ERR(from_name)) { + rc = PTR_ERR(from_name); goto cifs_rename_exit; } - to_name = build_path_from_dentry(target_dentry); - if (to_name == NULL) { - rc = -ENOMEM; + to_name = build_path_from_dentry(target_dentry, page2); + if (IS_ERR(to_name)) { + rc = PTR_ERR(to_name); goto cifs_rename_exit; } @@ -2177,18 +2177,21 @@ unlink_target: cifs_rename_exit: kfree(info_buf_source); - kfree(from_name); - kfree(to_name); + free_dentry_path(page2); + free_dentry_path(page1); free_xid(xid); cifs_put_tlink(tlink); return rc; } static bool -cifs_inode_needs_reval(struct inode *inode) +cifs_dentry_needs_reval(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct 
cached_fid *cfid = NULL; if (cifs_i->time == 0) return true; @@ -2199,6 +2202,16 @@ cifs_inode_needs_reval(struct inode *inode) if (!lookupCacheEnabled) return true; + if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { + mutex_lock(&cfid->fid_mutex); + if (cfid->time && cifs_i->time > cfid->time) { + mutex_unlock(&cfid->fid_mutex); + close_cached_dir(cfid); + return false; + } + mutex_unlock(&cfid->fid_mutex); + close_cached_dir(cfid); + } /* * depending on inode type, check if attribute caching disabled for * files or directories @@ -2297,10 +2310,10 @@ cifs_zap_mapping(struct inode *inode) int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; - struct inode *inode = file_inode(filp); + struct dentry *dentry = file_dentry(filp); struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; - if (!cifs_inode_needs_reval(inode)) + if (!cifs_dentry_needs_reval(dentry)) return rc; if (tlink_tcon(cfile->tlink)->unix_ext) @@ -2317,22 +2330,22 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) int rc = 0; struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; - char *full_path = NULL; + const char *full_path; + void *page; int count = 0; if (inode == NULL) return -ENOENT; - if (!cifs_inode_needs_reval(inode)) + if (!cifs_dentry_needs_reval(dentry)) return rc; xid = get_xid(); - /* can not safely grab the rename sem here if rename calls revalidate - since that would deadlock */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -2351,7 +2364,7 @@ again: if (rc == -EAGAIN && count++ < 10) goto again; out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; @@ -2522,7 +2535,7 @@ void cifs_setsize(struct inode *inode, loff_t offset) static int cifs_set_file_size(struct inode *inode, struct iattr *attrs, - unsigned int xid, char *full_path) + unsigned int xid, const char *full_path) { int rc; struct cifsFileInfo *open_file; @@ -2613,7 +2626,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) { int rc; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct inode *inode = d_inode(direntry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2634,9 +2648,9 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (rc < 0) goto out; - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -2748,7 +2762,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) cifsInode->time = 0; out: kfree(args); - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2764,7 +2778,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifsFileInfo *wfile; struct cifs_tcon *tcon; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); int rc = -EACCES; __u32 dosattr = 0; __u64 mode = NO_CHANGE_64; @@ -2778,16 +2793,13 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid |= ATTR_FORCE; rc = setattr_prepare(&init_user_ns, direntry, attrs); - if (rc < 0) { - free_xid(xid); - return rc; - } + if (rc 
< 0) + goto cifs_setattr_exit; - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; - free_xid(xid); - return rc; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto cifs_setattr_exit; } /* @@ -2937,8 +2949,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) mark_inode_dirty(inode); cifs_setattr_exit: - kfree(full_path); free_xid(xid); + free_dentry_path(page); return rc; } @@ -2961,12 +2973,3 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, /* BB: add cifs_setattr_legacy for really old servers */ return rc; } - -#if 0 -void cifs_delete_inode(struct inode *inode) -{ - cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode); - /* may have to add back in if and when safe distributed caching of - directories added e.g. via FindNotify */ -} -#endif diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index dcde44ff6cf9..08d99fec593e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -42,13 +42,16 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct dentry *dentry = filep->f_path.dentry; - unsigned char *path; + const unsigned char *path; + void *page = alloc_dentry_path(); __le16 *utf16_path = NULL, root_path; int rc = 0; - path = build_path_from_dentry(dentry); - if (path == NULL) - return -ENOMEM; + path = build_path_from_dentry(dentry, page); + if (IS_ERR(path)) { + free_dentry_path(page); + return PTR_ERR(path); + } cifs_dbg(FYI, "%s %s\n", __func__, path); @@ -73,7 +76,7 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, ici_exit: if (utf16_path != &root_path) kfree(utf16_path); - kfree(path); + free_dentry_path(page); return rc; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 7c5878a645d9..616e1bc0cc0a 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -510,8 +510,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, { int rc = -EACCES; unsigned int xid; - char *from_name = NULL; - char *to_name = NULL; + const char *from_name, *to_name; + void *page1, *page2; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; struct cifs_tcon *tcon; @@ -524,11 +524,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, tcon = tlink_tcon(tlink); xid = get_xid(); + page1 = alloc_dentry_path(); + page2 = alloc_dentry_path(); - from_name = build_path_from_dentry(old_file); - to_name = build_path_from_dentry(direntry); - if ((from_name == NULL) || (to_name == NULL)) { - rc = -ENOMEM; + from_name = build_path_from_dentry(old_file, page1); + if (IS_ERR(from_name)) { + rc = PTR_ERR(from_name); + goto cifs_hl_exit; + } + to_name = build_path_from_dentry(direntry, page2); + if (IS_ERR(to_name)) { + rc = PTR_ERR(to_name); goto cifs_hl_exit; } @@ -587,8 +593,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, } cifs_hl_exit: - kfree(from_name); - kfree(to_name); + free_dentry_path(page1); + free_dentry_path(page2); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -600,7 +606,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, { int rc = -ENOMEM; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page; char *target_path = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; @@ -620,11 +627,13 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, tcon = 
tlink_tcon(tlink); server = tcon->ses->server; - full_path = build_path_from_dentry(direntry); - if (!full_path) { + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { free_xid(xid); cifs_put_tlink(tlink); - return ERR_PTR(-ENOMEM); + free_dentry_path(page); + return ERR_CAST(full_path); } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); @@ -649,7 +658,7 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, &target_path, reparse_point); } - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); if (rc != 0) { @@ -669,7 +678,8 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct inode *newinode = NULL; xid = get_xid(); @@ -681,9 +691,9 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, } pTcon = tlink_tcon(tlink); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto symlink_exit; } @@ -719,7 +729,7 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, } } symlink_exit: - kfree(full_path); + free_dentry_path(page); cifs_put_tlink(tlink); free_xid(xid); return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 82e176720ca6..c15a90e422be 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1180,7 +1180,7 @@ int update_super_prepath(struct cifs_tcon *tcon, char *prefix) kfree(cifs_sb->prepath); if (prefix && *prefix) { - cifs_sb->prepath = kstrndup(prefix, strlen(prefix), GFP_ATOMIC); + cifs_sb->prepath = kstrdup(prefix, GFP_ATOMIC); if (!cifs_sb->prepath) { rc = -ENOMEM; goto out; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 80bf4c6f4c7b..63bfc533c9fb 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -119,9 +119,7 @@ retry: /* update inode in place * if both i_ino and i_mode didn't change */ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid && - (inode->i_mode & S_IFMT) == - (fattr->cf_mode & S_IFMT)) { - cifs_fattr_to_inode(inode, fattr); + cifs_fattr_to_inode(inode, fattr) == 0) { dput(dentry); return; } @@ -384,7 +382,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, static int initiate_cifs_search(const unsigned int xid, struct file *file, - char *full_path) + const char *full_path) { __u16 search_flags; int rc = 0; @@ -704,7 +702,7 @@ static int cifs_save_resume_key(const char *current_entry, */ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, - struct file *file, char *full_path, + struct file *file, const char *full_path, char **current_entry, int *num_to_ret) { __u16 search_flags; @@ -942,13 +940,14 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) char *tmp_buf = NULL; char *end_of_smb; unsigned int max_len; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); xid = get_xid(); - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(file_dentry(file), page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto rddir2_exit; } @@ -1043,7 +1042,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: - kfree(full_path); + 
free_dentry_path(page); free_xid(xid); return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index e31b939e628c..3b83839fc2c2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -926,9 +926,7 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, 0); if (!rc) { - *symlinkinfo = kstrndup(referral.node_name, - strlen(referral.node_name), - GFP_KERNEL); + *symlinkinfo = kstrdup(referral.node_name, GFP_KERNEL); free_dfs_info_param(&referral); if (!*symlinkinfo) rc = -ENOMEM; @@ -1027,7 +1025,7 @@ cifs_can_echo(struct TCP_Server_Info *server) static int cifs_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *newinode = NULL; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index a718dc77e604..9a61209a283e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -512,7 +512,6 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; - bool no_cached_open = tcon->nohandlecache; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; @@ -525,11 +524,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; /* If it is a root and its handle is cached then use it */ - if (!strlen(full_path) && !no_cached_open) { - rc = open_shroot(xid, tcon, cifs_sb, &cfid); - if (rc) - goto out; - + rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); + if (!rc) { if (tcon->crfid.file_all_info_is_valid) { move_smb2_info_to_cifs(data, &tcon->crfid.file_all_info); @@ -540,7 +536,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!rc) move_smb2_info_to_cifs(data, smb2_data); } - close_shroot(cfid); + close_cached_dir(cfid); goto out; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index aac384f69f74..06d555d4da9a 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -667,6 +667,7 @@ smb2_is_valid_lease_break(char *buffer) !memcmp(rsp->LeaseKey, tcon->crfid.fid->lease_key, SMB2_LEASE_KEY_SIZE)) { + tcon->crfid.time = 0; INIT_WORK(&tcon->crfid.lease_break, smb2_cached_lease_break); queue_work(cifsiod_wq, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f703204fb185..dd0eb665b680 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -690,17 +690,21 @@ smb2_close_cached_fid(struct kref *ref) cfid->is_valid = false; cfid->file_all_info_is_valid = false; cfid->has_lease = false; + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } } } -void close_shroot(struct cached_fid *cfid) +void close_cached_dir(struct cached_fid *cfid) { mutex_lock(&cfid->fid_mutex); kref_put(&cfid->refcount, smb2_close_cached_fid); mutex_unlock(&cfid->fid_mutex); } -void close_shroot_lease_locked(struct cached_fid *cfid) +void close_cached_dir_lease_locked(struct cached_fid *cfid) { if (cfid->has_lease) { cfid->has_lease = false; @@ -708,10 +712,10 @@ void close_shroot_lease_locked(struct cached_fid *cfid) } } -void close_shroot_lease(struct cached_fid *cfid) +void close_cached_dir_lease(struct cached_fid *cfid) { mutex_lock(&cfid->fid_mutex); - close_shroot_lease_locked(cfid); + close_cached_dir_lease_locked(cfid); mutex_unlock(&cfid->fid_mutex); } @@ -721,13 +725,15 @@ smb2_cached_lease_break(struct work_struct *work) struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - 
close_shroot_lease(cfid); + close_cached_dir_lease(cfid); } /* - * Open the directory at the root of a share + * Open and cache a directory handle. + * Only supported for the root handle. */ -int open_shroot(unsigned int xid, struct cifs_tcon *tcon, +int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, + const char *path, struct cifs_sb_info *cifs_sb, struct cached_fid **cfid) { @@ -745,6 +751,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; struct cifs_fid *pfid; + struct dentry *dentry; + + if (tcon->nohandlecache) + return -ENOTSUPP; + + if (cifs_sb->root == NULL) + return -ENOENT; + + if (strlen(path)) + return -ENOENT; + + dentry = cifs_sb->root; mutex_lock(&tcon->crfid.fid_mutex); if (tcon->crfid.is_valid) { @@ -830,11 +848,9 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, }; /* - * caller expects this func to set pfid to a valid - * cached root, so we copy the existing one and get a - * reference. + * caller expects this func to set the fid in crfid to a valid + * cached root, so increment the refcount. */ - memcpy(pfid, tcon->crfid.fid, sizeof(*pfid)); kref_get(&tcon->crfid.refcount); mutex_unlock(&tcon->crfid.fid_mutex); @@ -867,13 +883,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); #endif /* CIFS_DEBUG2 */ - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); tcon->crfid.tcon = tcon; tcon->crfid.is_valid = true; + tcon->crfid.dentry = dentry; + dget(dentry); kref_init(&tcon->crfid.refcount); /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { + /* + * See commit 2f94a3125b87.
Increment the refcount when we + * get a lease for root, release it if lease break occurs + */ kref_get(&tcon->crfid.refcount); tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, @@ -892,6 +913,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, &rsp_iov[1], sizeof(struct smb2_file_all_info), (char *)&tcon->crfid.file_all_info)) tcon->crfid.file_all_info_is_valid = true; + tcon->crfid.time = jiffies; + oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); @@ -905,6 +928,22 @@ oshr_free: return rc; } +int open_cached_dir_by_dentry(struct cifs_tcon *tcon, + struct dentry *dentry, + struct cached_fid **cfid) +{ + mutex_lock(&tcon->crfid.fid_mutex); + if (tcon->crfid.dentry == dentry) { + cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + *cfid = &tcon->crfid; + kref_get(&tcon->crfid.refcount); + mutex_unlock(&tcon->crfid.fid_mutex); + return 0; + } + mutex_unlock(&tcon->crfid.fid_mutex); + return -ENOENT; +} + static void smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) @@ -914,7 +953,6 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; - bool no_cached_open = tcon->nohandlecache; struct cached_fid *cfid = NULL; oparms.tcon = tcon; @@ -924,14 +962,12 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - if (no_cached_open) { + rc = open_cached_dir(xid, tcon, "", cifs_sb, &cfid); + if (rc == 0) + memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); + else rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); - } else { - rc = open_shroot(xid, tcon, cifs_sb, &cfid); - if (rc == 0) - memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); - } if (rc) return; @@ -945,10 +981,10 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, FS_VOLUME_INFORMATION); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ - if (no_cached_open) + if (cfid == NULL) SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); else - close_shroot(cfid); + close_cached_dir(cfid); } static void @@ -1531,7 +1567,10 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, NULL, 0 /* no input */, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); - if (rc) { + if (rc == -EOPNOTSUPP) { + pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); + goto req_res_key_exit; + } else if (rc) { cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); goto req_res_key_exit; } @@ -1763,18 +1802,14 @@ smb2_ioctl_query_info(const unsigned int xid, } iqinf_exit: - kfree(vars); - kfree(buffer); - SMB2_open_free(&rqst[0]); - if (qi.flags & PASSTHRU_FSCTL) - SMB2_ioctl_free(&rqst[1]); - else - SMB2_query_info_free(&rqst[1]); - - SMB2_close_free(&rqst[2]); + cifs_small_buf_release(rqst[0].rq_iov[0].iov_base); + cifs_small_buf_release(rqst[1].rq_iov[0].iov_base); + cifs_small_buf_release(rqst[2].rq_iov[0].iov_base); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); + kfree(buffer); return rc; e_fault: @@ -2217,22 +2252,23 @@ smb3_notify(const unsigned int xid, struct file *pfile, struct smb3_notify notify; struct dentry *dentry = pfile->f_path.dentry; struct inode *inode = file_inode(pfile); - struct cifs_sb_info *cifs_sb; + struct cifs_sb_info *cifs_sb = 
CIFS_SB(inode->i_sb); struct cifs_open_parms oparms; struct cifs_fid fid; struct cifs_tcon *tcon; - unsigned char *path = NULL; + const unsigned char *path; + void *page = alloc_dentry_path(); __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int rc = 0; - path = build_path_from_dentry(dentry); - if (path == NULL) - return -ENOMEM; - - cifs_sb = CIFS_SB(inode->i_sb); + path = build_path_from_dentry(dentry, page); + if (IS_ERR(path)) { + rc = PTR_ERR(path); + goto notify_exit; + } - utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb); + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (utf16_path == NULL) { rc = -ENOMEM; goto notify_exit; @@ -2264,7 +2300,7 @@ smb3_notify(const unsigned int xid, struct file *pfile, cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); notify_exit: - kfree(path); + free_dentry_path(page); kfree(utf16_path); return rc; } @@ -3652,6 +3688,77 @@ out: return rc; } +static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len) +{ + int rc; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; + __le64 eof; + + xid = get_xid(); + + if (off >= i_size_read(file->f_inode) || + off + len >= i_size_read(file->f_inode)) { + rc = -EINVAL; + goto out; + } + + rc = smb2_copychunk_range(xid, cfile, cfile, off + len, + i_size_read(file->f_inode) - off - len, off); + if (rc < 0) + goto out; + + eof = cpu_to_le64(i_size_read(file->f_inode) - len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) + goto out; + + rc = 0; + out: + free_xid(xid); + return rc; +} + +static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len) +{ + int rc; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; + __le64 eof; + __u64 count; + + xid = get_xid(); + + if (off >= i_size_read(file->f_inode)) { + rc = -EINVAL; + goto out; + } + + count = i_size_read(file->f_inode) - off; + eof = cpu_to_le64(i_size_read(file->f_inode) + len); + + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) + goto out; + + rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); + if (rc < 0) + goto out; + + rc = smb3_zero_range(file, tcon, off, len, 1); + if (rc < 0) + goto out; + + rc = 0; + out: + free_xid(xid); + return rc; +} + static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offset, int whence) { struct cifsFileInfo *wrcfile, *cfile = file->private_data; @@ -3823,6 +3930,10 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, return smb3_zero_range(file, tcon, off, len, false); } else if (mode == FALLOC_FL_KEEP_SIZE) return smb3_simple_falloc(file, tcon, off, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + return smb3_collapse_range(file, tcon, off, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + return smb3_insert_range(file, tcon, off, len); else if (mode == 0) return smb3_simple_falloc(file, tcon, off, len, false); @@ -4178,7 +4289,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) } spin_unlock(&cifs_tcp_ses_lock); - return 1; + return -EAGAIN; } /* * Encrypt or decrypt @rqst message. 
@rqst[0] has the following format: @@ -4968,7 +5079,7 @@ smb2_next_header(char *buf) static int smb2_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2199a9bfae8f..e36c2a867783 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1857,7 +1857,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot_lease(&tcon->crfid); + close_cached_dir_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, (void **) &req, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index a5a9e33c0d73..6442dc1c292b 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -144,7 +144,7 @@ struct smb2_transform_hdr { } __packed; /* See MS-SMB2 2.2.42 */ -struct smb2_compression_transform_hdr { +struct smb2_compression_transform_hdr_unchained { __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ __le32 OriginalCompressedSegmentSize; __le16 CompressionAlgorithm; @@ -160,10 +160,17 @@ struct compression_payload_header { __le16 CompressionAlgorithm; __le16 Flags; __le32 Length; /* length of compressed playload including field below if present */ - /* __le32 OriginalPayloadSize; */ /* optional */ + /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ } __packed; /* See MS-SMB2 2.2.42.2 */ +struct smb2_compression_transform_hdr_chained { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + /* struct compression_payload_header[] */ +} __packed; + +/* See MS-SMB2 2.2.42.2.2 */ struct compression_pattern_payload_v1 { __le16 Pattern; __le16 Reserved1; @@ -181,7 +188,11 @@ struct smb2_rdma_transform { __le32 Reserved2; } __packed; -struct smb2_rdma_encryption_transform { +/* TransformType */ +#define SMB2_RDMA_TRANSFORM_TYPE_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_TYPE_SIGNING 0x0002 + +struct smb2_rdma_crypto_transform { __le16 TransformType; __le16 SignatureLength; __le16 NonceLength; @@ -409,13 +420,29 @@ struct smb2_netname_neg_context { } __packed; /* - * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 * and 2.2.4.1.5 */ +/* Flags */ +#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 + +struct smb2_transport_capabilities_context { + __le16 ContextType; /* 6 */ + __le16 DataLength; + __u32 Reserved; + __le32 Flags; +} __packed; + +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.6 + */ + /* RDMA Transform IDs */ #define SMB2_RDMA_TRANSFORM_NONE 0x0000 #define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 struct smb2_rdma_transform_capabilities_context { __le16 ContextType; /* 7 */ @@ -427,6 +454,11 @@ struct smb2_rdma_transform_capabilities_context { __le16 RDMATransformIds[]; } __packed; +/* + * For signing capabilities context see MS-SMB2 2.2.3.1.7 + * and 2.2.4.1.7 + */ + /* Signing algorithms */ #define SIGNING_ALG_HMAC_SHA256 0 #define SIGNING_ALG_AES_CMAC 1 @@ -634,7 +666,8 @@ struct smb2_tree_connect_rsp { #define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 #define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 #define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0004FF33 
+#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ +#define SHI1005_FLAGS_ALL 0x0014FF33 /* Possible share capabilities */ #define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ @@ -1390,7 +1423,11 @@ struct smb2_lock_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; - __le32 Reserved; + /* + * The least significant four bits are the index, the other 28 bits are + * the lock sequence number (0 to 64). See MS-SMB2 2.2.26 + */ + __le32 LockSequenceNumber; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ /* Followed by at least one */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a2eb34a8d9c9..a5f87b02cfaf 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -69,12 +69,16 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); -extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - struct cached_fid **cfid); -extern void close_shroot(struct cached_fid *cfid); -extern void close_shroot_lease(struct cached_fid *cfid); -extern void close_shroot_lease_locked(struct cached_fid *cfid); +extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, + const char *path, + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid); +extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, + struct dentry *dentry, + struct cached_fid **cfid); +extern void close_cached_dir(struct cached_fid *cfid); +extern void close_cached_dir_lease(struct cached_fid *cfid); +extern void close_cached_dir_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/unc.c b/fs/cifs/unc.c index 394aa00cea40..f6fc5e343ea4 100644 --- a/fs/cifs/unc.c +++ b/fs/cifs/unc.c @@ -50,7 +50,6 @@ char *extract_sharename(const char *unc) { const char *src; char *delim, *dst; - int len; /* skip double chars at the beginning */ src = unc + 2; @@ -60,10 +59,9 @@ char *extract_sharename(const char *unc) if (!delim) return ERR_PTR(-EINVAL); delim++; - len = strlen(delim); /* caller has to free the memory */ - dst = kstrndup(delim, len, GFP_KERNEL); + dst = kstrdup(delim, GFP_KERNEL); if (!dst) return ERR_PTR(-ENOMEM); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 41a611e76bb7..e351b945135b 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -53,7 +53,7 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT, XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL }; static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, - struct inode *inode, char *full_path, + struct inode *inode, const char *full_path, const void *value, size_t size) { ssize_t rc = -EOPNOTSUPP; @@ -77,7 +77,7 @@ static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, - struct inode *inode, char *full_path, + struct inode *inode, const char *full_path, const void *value, size_t size) { ssize_t rc = -EOPNOTSUPP; @@ -112,7 +112,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ 
-120,10 +121,11 @@ static int cifs_xattr_set(const struct xattr_handler *handler, pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } /* return dos attributes as pseudo xattr */ @@ -235,7 +237,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -297,7 +299,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -305,10 +308,11 @@ static int cifs_xattr_get(const struct xattr_handler *handler, pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -401,7 +405,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = -EOPNOTSUPP; out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -414,7 +418,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) return -EOPNOTSUPP; @@ -425,10 +430,11 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto list_ea_exit; } /* return dos attributes as pseudo xattr */ @@ -442,7 +448,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, NULL, data, buf_size, cifs_sb); list_ea_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; } diff --git a/fs/coda/file.c b/fs/coda/file.c index 128d63df5bfb..ef5ca22bfb3e 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -175,10 +175,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) ret = call_mmap(vma->vm_file, vma); if (ret) { - /* if call_mmap fails, our caller will put coda_file so we - * should drop the reference to the host_file that we got. + /* if call_mmap fails, our caller will put host_file so we + * should drop the reference to the coda_file that we got. */ - fput(host_file); + fput(coda_file); kfree(cvm_ops); } else { /* here we add redirects for the open/close vm_operations */ diff --git a/fs/coredump.c b/fs/coredump.c index 1c0fdc1aa70b..2868e3e171ae 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -809,6 +809,16 @@ void do_coredump(const kernel_siginfo_t *siginfo) } file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); + /* + * Ensures that file size is big enough to contain the current + * file position.
This prevents gdb from complaining about + * a truncated file if the last "write" to the file was + * dump_skip. + */ + if (cprm.to_skip) { + cprm.to_skip--; + dump_emit(&cprm, "", 1); + } file_end_write(cprm.file); } if (ispipe && core_pipe_limit) @@ -835,7 +845,7 @@ fail: * do on a core-file: use only these functions to write out all the * necessary info. */ -int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr) { struct file *file = cprm->file; loff_t pos = file->f_pos; @@ -855,9 +865,8 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) return 1; } -EXPORT_SYMBOL(dump_emit); -int dump_skip(struct coredump_params *cprm, size_t nr) +static int __dump_skip(struct coredump_params *cprm, size_t nr) { static char zeroes[PAGE_SIZE]; struct file *file = cprm->file; @@ -869,13 +878,35 @@ int dump_skip(struct coredump_params *cprm, size_t nr) return 1; } else { while (nr > PAGE_SIZE) { - if (!dump_emit(cprm, zeroes, PAGE_SIZE)) + if (!__dump_emit(cprm, zeroes, PAGE_SIZE)) return 0; nr -= PAGE_SIZE; } - return dump_emit(cprm, zeroes, nr); + return __dump_emit(cprm, zeroes, nr); } } + +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + return __dump_emit(cprm, addr, nr); +} +EXPORT_SYMBOL(dump_emit); + +void dump_skip_to(struct coredump_params *cprm, unsigned long pos) +{ + cprm->to_skip = pos - cprm->pos; +} +EXPORT_SYMBOL(dump_skip_to); + +void dump_skip(struct coredump_params *cprm, size_t nr) +{ + cprm->to_skip += nr; +} EXPORT_SYMBOL(dump_skip); #ifdef CONFIG_ELF_CORE @@ -902,11 +933,11 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, stop = !dump_emit(cprm, kaddr, PAGE_SIZE); kunmap_local(kaddr); put_page(page); + if (stop) + return 0; } else { - stop = !dump_skip(cprm, PAGE_SIZE); + dump_skip(cprm, PAGE_SIZE); } - if (stop) - return 0; } return 1; } @@ -914,33 +945,16 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->pos & (align - 1); + unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1); if (align & (align - 1)) return 0; - return mod ? dump_skip(cprm, align - mod) : 1; + if (mod) + cprm->to_skip += align - mod; + return 1; } EXPORT_SYMBOL(dump_align); /* - * Ensures that file size is big enough to contain the current file - * postion. This prevents gdb from complaining about a truncated file - * if the last "write" to the file was dump_skip. - */ -void dump_truncate(struct coredump_params *cprm) -{ - struct file *file = cprm->file; - loff_t offset; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - offset = file->f_op->llseek(file, 0, SEEK_CUR); - if (i_size_read(file->f_mapping->host) < offset) - do_truncate(file_mnt_user_ns(file), file->f_path.dentry, - offset, 0, file); - } -} -EXPORT_SYMBOL(dump_truncate); - -/* * The purpose of always_dump_vma() is to make sure that special kernel mappings * that are useful for post-mortem analysis are included in every core dump. * In that way we ensure that the core dump is fully interpretable later diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index a5f5c30368a2..2d0c8922f635 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -14,16 +14,30 @@ config FS_ENCRYPTION F2FS and UBIFS make use of this feature. 
# Filesystems supporting encryption must select this if FS_ENCRYPTION. This -# allows the algorithms to be built as modules when all the filesystems are. +# allows the algorithms to be built as modules when all the filesystems are, +# whereas selecting them from FS_ENCRYPTION would force them to be built-in. +# +# Note: this option only pulls in the algorithms that filesystem encryption +# needs "by default". If userspace will use "non-default" encryption modes such +# as Adiantum encryption, then those other modes need to be explicitly enabled +# in the crypto API; see Documentation/filesystems/fscrypt.rst for details. +# +# Also note that this option only pulls in the generic implementations of the +# algorithms, not any per-architecture optimized implementations. It is +# strongly recommended to enable optimized implementations too. It is safe to +# disable these generic implementations if corresponding optimized +# implementations will always be available too; for this reason, these are soft +# dependencies ('imply' rather than 'select'). Only disable these generic +# implementations if you're sure they will never be needed, though. config FS_ENCRYPTION_ALGS tristate - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_CTS - select CRYPTO_ECB - select CRYPTO_HMAC - select CRYPTO_SHA512 - select CRYPTO_XTS + imply CRYPTO_AES + imply CRYPTO_CBC + imply CRYPTO_CTS + imply CRYPTO_ECB + imply CRYPTO_HMAC + imply CRYPTO_SHA512 + imply CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT bool "Enable fscrypt to use inline crypto" diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 686e0ad28788..9979d981e9be 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -773,7 +773,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { - char buf[3]; + char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); @@ -789,7 +789,6 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, else buf[0] = 'N'; buf[1] = '\n'; - buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 22e86ae4dd5a..1d252164d97b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -35,7 +35,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS; +static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down diff --git a/fs/direct-io.c b/fs/direct-io.c index b61491bf3166..b2e86e739d7a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -812,6 +812,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, struct buffer_head *map_bh) { int ret = 0; + int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ if (dio->op == REQ_OP_WRITE) { /* @@ -850,10 +851,10 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; out: /* - * If sdio->boundary then we want to schedule the IO now to + * If boundary then we want to schedule the IO now to * avoid metadata seeks. 
*/ - if (sdio->boundary) { + if (boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); if (sdio->bio) dio_bio_submit(dio, sdio); diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 74b0aaa7114c..858b3339f381 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -76,17 +76,3 @@ config EROFS_FS_ZIP If you don't want to enable compression feature, say N. -config EROFS_FS_CLUSTER_PAGE_LIMIT - int "EROFS Cluster Pages Hard Limit" - depends on EROFS_FS_ZIP - range 1 256 - default "1" - help - Indicates maximum # of pages of a compressed - physical cluster. - - For example, if files in a image were compressed - into 8k-unit, hard limit should not be configured - less than 2. Otherwise, the image will be refused - to mount on this kernel. - diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index af159539fc1b..1f9aced49070 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 1249e74b3bf0..ebac756cb2a3 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -109,21 +109,6 @@ err_out: return err; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) -{ - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { - int err = z_erofs_map_blocks_iter(inode, map, flags); - - if (map->mpage) { - put_page(map->mpage); - map->mpage = NULL; - } - return err; - } - return erofs_map_blocks_flatmode(inode, map, flags); -} - static inline struct bio *erofs_read_raw_page(struct bio *bio, struct address_space *mapping, struct page *page, @@ -159,7 +144,7 @@ submit_bio_retry: erofs_blk_t blknr; unsigned int blkoff; - err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW); if (err) goto err_out; @@ -318,7 +303,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return 0; } - if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW)) + if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW)) return erofs_blknr(map.m_pa); return 0; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 1cb1ffd10569..88e33addf229 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -28,6 +28,42 @@ struct z_erofs_decompressor { char *name; }; +int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int size) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + u16 distance; + + if (lz4) { + if (size < sizeof(struct z_erofs_lz4_cfgs)) { + erofs_err(sb, "invalid lz4 cfgs, size=%u", size); + return -EINVAL; + } + distance = le16_to_cpu(lz4->max_distance); + + sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks); + if (!sbi->lz4.max_pclusterblks) { + sbi->lz4.max_pclusterblks = 1; /* reserved case */ + } else if (sbi->lz4.max_pclusterblks > + Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + erofs_err(sb, "too large lz4 pclusterblks %u", + sbi->lz4.max_pclusterblks); + return -EINVAL; + } else if (sbi->lz4.max_pclusterblks >= 2) { + erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. 
Use at your own risk!"); + } + } else { + distance = le16_to_cpu(dsb->u1.lz4_max_distance); + sbi->lz4.max_pclusterblks = 1; + } + + sbi->lz4.max_distance_pages = distance ? + DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : + LZ4_MAX_DISTANCE_PAGES; + return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); +} + static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct list_head *pagepool) { @@ -36,6 +72,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; + unsigned int lz4_max_distance_pages = + EROFS_SB(rq->sb)->lz4.max_distance_pages; void *kaddr = NULL; unsigned int i, j, top; @@ -44,14 +82,14 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct page *const page = rq->out[i]; struct page *victim; - if (j >= LZ4_MAX_DISTANCE_PAGES) + if (j >= lz4_max_distance_pages) j = 0; /* 'valid' bounced can only be tested after a complete round */ if (test_bit(j, bounced)) { - DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES); - DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES); - availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES]; + DBG_BUGON(i < lz4_max_distance_pages); + DBG_BUGON(top >= lz4_max_distance_pages); + availables[top++] = rq->out[i - lz4_max_distance_pages]; } if (page) { @@ -73,9 +111,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, GFP_KERNEL); - if (!victim) - return -ENOMEM; + victim = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; @@ -83,96 +120,123 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, return kaddr ? 1 : 0; } -static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq, - u8 *src, unsigned int pageofs_in) +static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq, + void *inpage, unsigned int *inputmargin, int *maptype, + bool support_0padding) { - /* - * if in-place decompression is ongoing, those decompressed - * pages should be copied in order to avoid being overlapped. 
- */ - struct page **in = rq->in; - u8 *const tmp = erofs_get_pcpubuf(0); - u8 *tmpp = tmp; - unsigned int inlen = rq->inputsize - pageofs_in; - unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); - - while (tmpp < tmp + inlen) { - if (!src) - src = kmap_atomic(*in); - memcpy(tmpp, src + pageofs_in, count); - kunmap_atomic(src); - src = NULL; - tmpp += count; - pageofs_in = 0; - count = PAGE_SIZE; + unsigned int nrpages_in, nrpages_out; + unsigned int ofull, oend, inputsize, total, i, j; + struct page **in; + void *src, *tmp; + + inputsize = rq->inputsize; + nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT; + oend = rq->pageofs_out + rq->outputsize; + ofull = PAGE_ALIGN(oend); + nrpages_out = ofull >> PAGE_SHIFT; + + if (rq->inplace_io) { + if (rq->partial_decoding || !support_0padding || + ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize)) + goto docopy; + + for (i = 0; i < nrpages_in; ++i) { + DBG_BUGON(rq->in[i] == NULL); + for (j = 0; j < nrpages_out - nrpages_in + i; ++j) + if (rq->out[j] == rq->in[i]) + goto docopy; + } + } + + if (nrpages_in <= 1) { + *maptype = 0; + return inpage; + } + kunmap_atomic(inpage); + might_sleep(); + src = erofs_vm_map_ram(rq->in, nrpages_in); + if (!src) + return ERR_PTR(-ENOMEM); + *maptype = 1; + return src; + +docopy: + /* Or copy compressed data which can be overlapped to per-CPU buffer */ + in = rq->in; + src = erofs_get_pcpubuf(nrpages_in); + if (!src) { + DBG_BUGON(1); + kunmap_atomic(inpage); + return ERR_PTR(-EFAULT); + } + + tmp = src; + total = rq->inputsize; + while (total) { + unsigned int page_copycnt = + min_t(unsigned int, total, PAGE_SIZE - *inputmargin); + + if (!inpage) + inpage = kmap_atomic(*in); + memcpy(tmp, inpage + *inputmargin, page_copycnt); + kunmap_atomic(inpage); + inpage = NULL; + tmp += page_copycnt; + total -= page_copycnt; ++in; + *inputmargin = 0; } - return tmp; + *maptype = 2; + return src; } static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) { - unsigned int inputmargin, inlen; - u8 *src; - bool copied, support_0padding; - int ret; - - if (rq->inputsize > PAGE_SIZE) - return -EOPNOTSUPP; + unsigned int inputmargin; + u8 *headpage, *src; + bool support_0padding; + int ret, maptype; - src = kmap_atomic(*rq->in); + DBG_BUGON(*rq->in == NULL); + headpage = kmap_atomic(*rq->in); inputmargin = 0; support_0padding = false; /* decompression inplace is only safe when 0padding is enabled */ - if (EROFS_SB(rq->sb)->feature_incompat & - EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) { + if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) { support_0padding = true; - while (!src[inputmargin & ~PAGE_MASK]) + while (!headpage[inputmargin & ~PAGE_MASK]) if (!(++inputmargin & ~PAGE_MASK)) break; if (inputmargin >= rq->inputsize) { - kunmap_atomic(src); + kunmap_atomic(headpage); return -EIO; } } - copied = false; - inlen = rq->inputsize - inputmargin; - if (rq->inplace_io) { - const uint oend = (rq->pageofs_out + - rq->outputsize) & ~PAGE_MASK; - const uint nr = PAGE_ALIGN(rq->pageofs_out + - rq->outputsize) >> PAGE_SHIFT; - - if (rq->partial_decoding || !support_0padding || - rq->out[nr - 1] != rq->in[0] || - rq->inputsize - oend < - LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) { - src = generic_copy_inplace_data(rq, src, inputmargin); - inputmargin = 0; - copied = true; - } - } + rq->inputsize -= inputmargin; + src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype, + support_0padding); + if (IS_ERR(src)) + return PTR_ERR(src); /* legacy format could compress extra data in a pcluster. 
*/ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, - inlen, rq->outputsize, - rq->outputsize); + rq->inputsize, rq->outputsize, rq->outputsize); else ret = LZ4_decompress_safe(src + inputmargin, out, - inlen, rq->outputsize); + rq->inputsize, rq->outputsize); if (ret != rq->outputsize) { erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", - ret, inlen, inputmargin, rq->outputsize); + ret, rq->inputsize, inputmargin, rq->outputsize); WARN_ON(1); print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, - 16, 1, src + inputmargin, inlen, true); + 16, 1, src + inputmargin, rq->inputsize, true); print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, 16, 1, out, rq->outputsize, true); @@ -181,10 +245,16 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) ret = -EIO; } - if (copied) - erofs_put_pcpubuf(src); - else + if (maptype == 0) { kunmap_atomic(src); + } else if (maptype == 1) { + vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT); + } else if (maptype == 2) { + erofs_put_pcpubuf(src); + } else { + DBG_BUGON(1); + return -EFAULT; + } return ret; } @@ -234,57 +304,51 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, const struct z_erofs_decompressor *alg = decompressors + rq->alg; unsigned int dst_maptype; void *dst; - int ret, i; + int ret; - if (nrpages_out == 1 && !rq->inplace_io) { - DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); - dst_maptype = 0; - goto dstmap_out; - } + /* two optimized fast paths only for non bigpcluster cases yet */ + if (rq->inputsize <= PAGE_SIZE) { + if (nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; + } - /* - * For the case of small output size (especially much less - * than PAGE_SIZE), memcpy the decompressed data rather than - * compressed data is preferred. - */ - if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(0); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - rq->inplace_io = false; - ret = alg->decompress(rq, dst); - if (!ret) - copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, - rq->outputsize); - - erofs_put_pcpubuf(dst); - return ret; + /* + * For the case of small output size (especially much less + * than PAGE_SIZE), memcpy the decompressed data rather than + * compressed data is preferred. + */ + if (rq->outputsize <= PAGE_SIZE * 7 / 8) { + dst = erofs_get_pcpubuf(1); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + rq->inplace_io = false; + ret = alg->decompress(rq, dst); + if (!ret) + copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, + rq->outputsize); + + erofs_put_pcpubuf(dst); + return ret; + } } + /* general decoding path which can be used for all cases */ ret = alg->prepare_destpages(rq, pagepool); - if (ret < 0) { + if (ret < 0) return ret; - } else if (ret) { + if (ret) { dst = page_address(*rq->out); dst_maptype = 1; goto dstmap_out; } - i = 0; - while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1); - - /* retry two more times (totally 3 times) */ - if (dst || ++i >= 3) - break; - vm_unmap_aliases(); - } - + dst = erofs_vm_map_ram(rq->out, nrpages_out); if (!dst) return -ENOMEM; - dst_maptype = 2; dstmap_out: diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 9ad1615f4474..8739d3adf51f 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -18,15 +18,22 @@ * be incompatible with this kernel version. 
*/ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 -#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING +#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 +#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 +#define EROFS_ALL_FEATURE_INCOMPAT \ + (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ + EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ + EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER) -/* 128-byte erofs on-disk super block */ +#define EROFS_SB_EXTSLOT_SIZE 16 + +/* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ __le32 checksum; /* crc32c(super_block) */ __le32 feature_compat; __u8 blkszbits; /* support block_size == PAGE_SIZE only */ - __u8 reserved; + __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ __le16 root_nid; /* nid of root directory */ __le64 inos; /* total valid ino # (== f_files - f_favail) */ @@ -39,7 +46,13 @@ struct erofs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __u8 volume_name[16]; /* volume name */ __le32 feature_incompat; - __u8 reserved2[44]; + union { + /* bitmap for available compression algorithms */ + __le16 available_compr_algs; + /* customized sliding window size instead of 64k by default */ + __le16 lz4_max_distance; + } __packed u1; + __u8 reserved2[42]; }; /* @@ -75,6 +88,9 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_VERSION_BIT 0 #define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_ALL \ + ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -189,20 +205,33 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) e->e_name_len + le16_to_cpu(e->e_value_size)); } +/* maximum supported size of a physical compression cluster */ +#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_MAX }; +#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) + +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lz4_cfgs { + __le16 max_distance; + __le16 max_pclusterblks; + u8 reserved[10]; +} __packed; /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; * (4B) + 2B + (4B) if compacted 2B is on. + * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) + * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) */ -#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 - -#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) +#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 struct z_erofs_map_header { __le32 h_reserved1; @@ -214,9 +243,7 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-4 : (physical - logical) cluster bits of head 1: - * For example, if logical clustersize = 4096, 1 for 8192. - * bit 5-7 : (physical - logical) cluster bits of head 2. + * bit 3-7 : reserved. */ __u8 h_clusterbits; }; @@ -259,6 +286,13 @@ enum { #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +/* + * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the + * compressed block count of a compressed extent (in logical clusters, aka. + * block count of a pcluster). 
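+ * + * Illustrative sketch (editor-added, not part of the original comment; delta0 and cblkcnt are hypothetical local names): a reader of di_u.delta[0] would decode this roughly the way the zmap.c hunks later in this patch do, i.e. if (delta0 & Z_EROFS_VLE_DI_D0_CBLKCNT) cblkcnt = delta0 & ~Z_EROFS_VLE_DI_D0_CBLKCNT; which yields the pcluster size in lclusters.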
+ */ +#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11) + struct z_erofs_vle_decompressed_index { __le16 di_advise; /* where to decompress in the head cluster */ diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 119fdce1b520..7ed2d7391692 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -44,6 +44,13 @@ static struct page *erofs_read_inode(struct inode *inode, dic = page_address(page) + *ofs; ifmt = le16_to_cpu(dic->i_format); + if (ifmt & ~EROFS_I_ALL) { + erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + ifmt, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + vi->datalayout = erofs_inode_datalayout(ifmt); if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 351dae524a0c..f92e3e32b9f4 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -50,6 +50,8 @@ struct erofs_fs_context { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; + /* strategy of sync decompression (false - auto, true - force on) */ + bool readahead_sync_decompress; /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; @@ -57,6 +59,14 @@ struct erofs_fs_context { unsigned int mount_opt; }; +/* all filesystem-wide lz4 configurations */ +struct erofs_sb_lz4_info { + /* # of pages needed for EROFS lz4 rolling decompression */ + u16 max_distance_pages; + /* maximum possible blocks for pclusters in the filesystem */ + u16 max_pclusterblks; +}; + struct erofs_sb_info { #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -67,9 +77,12 @@ struct erofs_sb_info { struct xarray managed_pslots; unsigned int shrinker_run_no; + u16 available_compr_algs; /* pseudo inode to manage cached pages */ struct inode *managed_cache; + + struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ u32 blocks; u32 meta_blkaddr; @@ -80,6 +93,7 @@ struct erofs_sb_info { /* inode slot unit size in bit shift */ unsigned char islotbits; + u32 sb_size; /* total superblock size */ u32 build_time_nsec; u64 build_time; @@ -182,12 +196,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) return v; } #endif /* !CONFIG_SMP */ - -/* hard limit of pages per compressed cluster */ -#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) -#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES -#else -#define EROFS_PCPUBUF_NR_PAGES 0 #endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ @@ -216,6 +224,17 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); } +#define EROFS_FEATURE_FUNCS(name, compat, feature) \ +static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ +{ \ + return sbi->feature_##compat & EROFS_FEATURE_##feature; \ +} + +EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) +EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) +EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) + /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 #define EROFS_I_Z_INITED_BIT 1 @@ -244,7 +263,6 @@ struct erofs_inode { unsigned short z_advise; unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; - unsigned char z_physical_clusterbits[2]; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ 
-287,7 +305,7 @@ extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; /* - * Logical to physical block mapping, used by erofs_map_blocks() + * Logical to physical block mapping * * Different with other file systems, it is used for 2 access modes: * @@ -334,7 +352,7 @@ struct erofs_map_blocks { struct page *mpage; }; -/* Flags used by erofs_map_blocks() */ +/* Flags used by erofs_map_blocks_flatmode() */ #define EROFS_GET_BLOCKS_RAW 0x0001 /* zmap.c */ @@ -356,8 +374,6 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, /* data.c */ struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); -int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); - /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) { @@ -386,23 +402,30 @@ int erofs_namei(struct inode *dir, struct qstr *name, /* dir.c */ extern const struct file_operations erofs_dir_fops; -/* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); - -#if (EROFS_PCPUBUF_NR_PAGES > 0) -void *erofs_get_pcpubuf(unsigned int pagenr); -#define erofs_put_pcpubuf(buf) do { \ - (void)&(buf); \ - preempt_enable(); \ -} while (0) -#else -static inline void *erofs_get_pcpubuf(unsigned int pagenr) +static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) { - return ERR_PTR(-EOPNOTSUPP); + int retried = 0; + + while (1) { + void *p = vm_map_ram(pages, count, -1); + + /* retry two more times (totally 3 times) */ + if (p || ++retried >= 3) + return p; + vm_unmap_aliases(); + } + return NULL; } -#define erofs_put_pcpubuf(buf) do {} while (0) -#endif +/* pcpubuf.c */ +void *erofs_get_pcpubuf(unsigned int requiredpages); +void erofs_put_pcpubuf(void *ptr); +int erofs_pcpubuf_growsize(unsigned int nrpages); +void erofs_pcpubuf_init(void); +void erofs_pcpubuf_exit(void); + +/* utils.c / zdata.c */ +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); @@ -421,6 +444,9 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); int erofs_try_to_free_cached_page(struct address_space *mapping, struct page *page); +int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int len); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -428,6 +454,16 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} +static inline int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int len) +{ + if (lz4 || dsb->u1.lz4_max_distance) { + erofs_err(sb, "lz4 algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} #endif /* !CONFIG_EROFS_FS_ZIP */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c new file mode 100644 index 000000000000..6c885575128a --- /dev/null +++ b/fs/erofs/pcpubuf.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Gao Xiang <xiang@kernel.org> + * + * For low-latency decompression algorithms (e.g. 
lz4), reserve consecutive + * per-CPU virtual memory (in pages) in advance to store such inplace I/O + * data if inplace decompression fails (due to an unmet inplace margin, for + * example). + */ +#include "internal.h" + +struct erofs_pcpubuf { + raw_spinlock_t lock; + void *ptr; + struct page **pages; + unsigned int nrpages; +}; + +static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb); + +void *erofs_get_pcpubuf(unsigned int requiredpages) + __acquires(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb); + + raw_spin_lock(&pcb->lock); + /* check if the per-CPU buffer is too small */ + if (requiredpages > pcb->nrpages) { + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); + /* (for sparse checker) pretend pcb->lock is still taken */ + __acquire(pcb->lock); + return NULL; + } + return pcb->ptr; +} + +void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id()); + + DBG_BUGON(pcb->ptr != ptr); + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); +} + +/* the next step: support per-CPU page buffers hotplug */ +int erofs_pcpubuf_growsize(unsigned int nrpages) +{ + static DEFINE_MUTEX(pcb_resize_mutex); + static unsigned int pcb_nrpages; + LIST_HEAD(pagepool); + int delta, cpu, ret, i; + + mutex_lock(&pcb_resize_mutex); + delta = nrpages - pcb_nrpages; + ret = 0; + /* avoid shrinking pcpubuf, since we have no idea how many fses rely on it */ + if (delta <= 0) + goto out; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + struct page **pages, **oldpages; + void *ptr, *old_ptr; + + pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + break; + } + + for (i = 0; i < nrpages; ++i) { + pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL); + if (!pages[i]) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + } + ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL); + if (!ptr) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + raw_spin_lock(&pcb->lock); + old_ptr = pcb->ptr; + pcb->ptr = ptr; + oldpages = pcb->pages; + pcb->pages = pages; + i = pcb->nrpages; + pcb->nrpages = nrpages; + raw_spin_unlock(&pcb->lock); + + if (!oldpages) { + DBG_BUGON(old_ptr); + continue; + } + + if (old_ptr) + vunmap(old_ptr); +free_pagearray: + while (i) + list_add(&oldpages[--i]->lru, &pagepool); + kfree(oldpages); + if (ret) + break; + } + pcb_nrpages = nrpages; + put_pages_list(&pagepool); +out: + mutex_unlock(&pcb_resize_mutex); + return ret; +} + +void erofs_pcpubuf_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + raw_spin_lock_init(&pcb->lock); + } +} + +void erofs_pcpubuf_exit(void) +{ + int cpu, i; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + if (pcb->ptr) { + vunmap(pcb->ptr); + pcb->ptr = NULL; + } + if (!pcb->pages) + continue; + + for (i = 0; i < pcb->nrpages; ++i) + if (pcb->pages[i]) + put_page(pcb->pages[i]); + kfree(pcb->pages); + pcb->pages = NULL; + } +} diff --git a/fs/erofs/super.c b/fs/erofs/super.c index d5a6b9b888a5..bbf3bbd908e0 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -122,6 +122,136 @@ static bool check_layout_compatibility(struct super_block *sb, return true; } +#ifdef CONFIG_EROFS_FS_ZIP +/* read variable-sized metadata, offset will be aligned to a 4-byte boundary */ +static void *erofs_read_metadata(struct super_block *sb, struct page **pagep, + erofs_off_t *offset, int *lengthp) +{ +
struct page *page = *pagep; + u8 *buffer, *ptr; + int len, i, cnt; + erofs_blk_t blk; + + *offset = round_up(*offset, 4); + blk = erofs_blknr(*offset); + + if (!page || page->index != blk) { + if (page) { + unlock_page(page); + put_page(page); + } + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) + goto err_nullpage; + } + + ptr = kmap(page); + len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); + if (!len) + len = U16_MAX + 1; + buffer = kmalloc(len, GFP_KERNEL); + if (!buffer) { + buffer = ERR_PTR(-ENOMEM); + goto out; + } + *offset += sizeof(__le16); + *lengthp = len; + + for (i = 0; i < len; i += cnt) { + cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); + blk = erofs_blknr(*offset); + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + kfree(buffer); + goto err_nullpage; + } + ptr = kmap(page); + } + memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); + *offset += cnt; + } +out: + kunmap(page); + *pagep = page; + return buffer; +err_nullpage: + *pagep = NULL; + return page; +} + +static int erofs_load_compr_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi; + struct page *page; + unsigned int algs, alg; + erofs_off_t offset; + int size, ret; + + sbi = EROFS_SB(sb); + sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); + + if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { + erofs_err(sb, "try to load compressed fs with unsupported algorithms %x", + sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); + return -EINVAL; + } + + offset = EROFS_SUPER_OFFSET + sbi->sb_size; + page = NULL; + alg = 0; + ret = 0; + + for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + void *data; + + if (!(algs & 1)) + continue; + + data = erofs_read_metadata(sb, &page, &offset, &size); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto err; + } + + switch (alg) { + case Z_EROFS_COMPRESSION_LZ4: + ret = z_erofs_load_lz4_config(sb, dsb, data, size); + break; + default: + DBG_BUGON(1); + ret = -EFAULT; + } + kfree(data); + if (ret) + goto err; + } +err: + if (page) { + unlock_page(page); + put_page(page); + } + return ret; +} +#else +static int erofs_load_compr_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) +{ + if (dsb->u1.available_compr_algs) { + erofs_err(sb, "try to load compressed fs when compression is disabled"); + return -EINVAL; + } + return 0; +} +#endif + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -149,7 +279,7 @@ static int erofs_read_superblock(struct super_block *sb) } sbi->feature_compat = le32_to_cpu(dsb->feature_compat); - if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) { + if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); if (ret) goto out; @@ -166,6 +296,12 @@ static int erofs_read_superblock(struct super_block *sb) if (!check_layout_compatibility(sb, dsb)) goto out; + sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; + if (sbi->sb_size > EROFS_BLKSIZ) { + erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", + sbi->sb_size); + goto out; + } sbi->blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR @@ -187,7 +323,12 @@ static int erofs_read_superblock(struct super_block *sb) ret = -EFSCORRUPTED; goto out; } - ret = 0; + + /* parse on-disk compression configurations */ + if 
(erofs_sb_has_compr_cfgs(sbi)) + ret = erofs_load_compr_cfgs(sb, dsb); + else + ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); out: kunmap(page); put_page(page); @@ -200,6 +341,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) #ifdef CONFIG_EROFS_FS_ZIP ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; ctx->max_sync_decompress_pages = 3; + ctx->readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(ctx, XATTR_USER); @@ -513,6 +655,7 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) goto zip_err; @@ -542,6 +685,7 @@ static void __exit erofs_module_exit(void) /* Ensure all RCU free inodes are safe before cache is destroyed. */ rcu_barrier(); kmem_cache_destroy(erofs_inode_cachep); + erofs_pcpubuf_exit(); } /* get filesystem statistics */ diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index de9986d2f82f..6758c5b19f7c 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -21,18 +21,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) return page; } -#if (EROFS_PCPUBUF_NR_PAGES > 0) -static struct { - u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES]; -} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS]; - -void *erofs_get_pcpubuf(unsigned int pagenr) -{ - preempt_disable(); - return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE]; -} -#endif - #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3851e1a64f73..78e4b598ecca 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -11,6 +11,93 @@ #include <trace/events/erofs.h> /* + * since pclustersize is variable for big pcluster feature, introduce slab + * pools implementation for different pcluster sizes. 
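+ * + * A hedged illustration (editor-added, not in the original comment): z_erofs_alloc_pcluster() below returns memory from the first pool whose maxpages covers the request, so e.g. a 9-page pcluster would be served from the "erofs_pcluster-16" kmem cache, while a request above Z_EROFS_PCLUSTER_MAX_PAGES fails with ERR_PTR(-EINVAL).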
+ */ +struct z_erofs_pcluster_slab { + struct kmem_cache *slab; + unsigned int maxpages; + char name[48]; +}; + +#define _PCLP(n) { .maxpages = n } + +static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { + _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) +}; + +static void z_erofs_destroy_pcluster_pool(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + if (!pcluster_pool[i].slab) + continue; + kmem_cache_destroy(pcluster_pool[i].slab); + pcluster_pool[i].slab = NULL; + } +} + +static int z_erofs_create_pcluster_pool(void) +{ + struct z_erofs_pcluster_slab *pcs; + struct z_erofs_pcluster *a; + unsigned int size; + + for (pcs = pcluster_pool; + pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { + size = struct_size(a, compressed_pages, pcs->maxpages); + + sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); + pcs->slab = kmem_cache_create(pcs->name, size, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (pcs->slab) + continue; + + z_erofs_destroy_pcluster_pool(); + return -ENOMEM; + } + return 0; +} + +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + struct z_erofs_pcluster *pcl; + + if (nrpages > pcs->maxpages) + continue; + + pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + if (!pcl) + return ERR_PTR(-ENOMEM); + pcl->pclusterpages = nrpages; + return pcl; + } + return ERR_PTR(-EINVAL); +} + +static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + + if (pcl->pclusterpages > pcs->maxpages) + continue; + + kmem_cache_free(pcs->slab, pcl); + return; + } + DBG_BUGON(1); +} + +/* * a compressed_pages[] placeholder in order to avoid * being filled with file pages for in-place decompression. */ @@ -37,12 +124,11 @@ typedef tagptr1_t compressed_page_t; tagptr_fold(compressed_page_t, page, 1) static struct workqueue_struct *z_erofs_workqueue __read_mostly; -static struct kmem_cache *pcluster_cachep __read_mostly; void z_erofs_exit_zip_subsystem(void) { destroy_workqueue(z_erofs_workqueue); - kmem_cache_destroy(pcluster_cachep); + z_erofs_destroy_pcluster_pool(); } static inline int z_erofs_init_workqueue(void) @@ -59,32 +145,16 @@ static inline int z_erofs_init_workqueue(void) return z_erofs_workqueue ? 
0 : -ENOMEM; } -static void z_erofs_pcluster_init_once(void *ptr) -{ - struct z_erofs_pcluster *pcl = ptr; - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - unsigned int i; - - mutex_init(&cl->lock); - cl->nr_pages = 0; - cl->vcnt = 0; - for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i) - pcl->compressed_pages[i] = NULL; -} - int __init z_erofs_init_zip_subsystem(void) { - pcluster_cachep = kmem_cache_create("erofs_compress", - Z_EROFS_WORKGROUP_SIZE, 0, - SLAB_RECLAIM_ACCOUNT, - z_erofs_pcluster_init_once); - if (pcluster_cachep) { - if (!z_erofs_init_workqueue()) - return 0; - - kmem_cache_destroy(pcluster_cachep); - } - return -ENOMEM; + int err = z_erofs_create_pcluster_pool(); + + if (err) + return err; + err = z_erofs_init_workqueue(); + if (err) + z_erofs_destroy_pcluster_pool(); + return err; } enum z_erofs_collectmode { @@ -104,6 +174,12 @@ enum z_erofs_collectmode { * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| */ COLLECT_PRIMARY_HOOKED, + /* + * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * could be dispatched into the bypass queue later due to up-to-date managed + * pages. All related online pages cannot be reused for inplace I/O (or + * pagevec) since it can be directly decoded without I/O submission. + */ COLLECT_PRIMARY_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and @@ -128,7 +204,8 @@ struct z_erofs_collector { struct z_erofs_pcluster *pcl, *tailpcl; struct z_erofs_collection *cl; - struct page **compressedpages; + /* a pointer used to pick up inplace I/O pages */ + struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; enum z_erofs_collectmode mode; @@ -162,18 +239,19 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, enum z_erofs_cache_alloctype type, struct list_head *pagepool) { - const struct z_erofs_pcluster *pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); - struct page **pages = clt->compressedpages; - pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + struct page **pages; + pgoff_t index; if (clt->mode < COLLECT_PRIMARY_FOLLOWED) return; - for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + pages = pcl->compressed_pages; + index = pcl->obj.index; + for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { struct page *page; compressed_page_t t; struct page *newpage = NULL; @@ -186,21 +264,25 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (page) { t = tag_compressed_page_justfound(page); - } else if (type == DELAYEDALLOC) { - t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); - } else if (type == TRYALLOC) { - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - goto dontalloc; - - set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - } else { /* DONTALLOC */ -dontalloc: - if (standalone) - clt->compressedpages = pages; + } else { + /* I/O is needed, not possible to decompress directly */ standalone = false; - continue; + switch (type) { + case DELAYEDALLOC: + t = tagptr_init(compressed_page_t, + PAGE_UNALLOCATED); + break; + case TRYALLOC: + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, + Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage);
+ break; + default: /* DONTALLOC */ + continue; + } } if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) @@ -214,7 +296,11 @@ dontalloc: } } - if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + /* + * don't do inplace I/O if all compressed pages are available in + * managed cache since it can be moved to the bypass queue instead. + */ + if (standalone) clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; } @@ -225,14 +311,13 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); struct address_space *const mapping = MNGD_MAPPING(sbi); - const unsigned int clusterpages = BIT(pcl->clusterbits); int i; /* * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. */ - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page = pcl->compressed_pages[i]; if (!page) @@ -257,13 +342,12 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - const unsigned int clusterpages = BIT(pcl->clusterbits); int ret = 0; /* 0 - busy */ if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { unsigned int i; - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { if (pcl->compressed_pages[i] == page) { WRITE_ONCE(pcl->compressed_pages[i], NULL); ret = 1; @@ -279,16 +363,14 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, } /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ -static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, - struct page *page) +static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, + struct page *page) { struct z_erofs_pcluster *const pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); - while (clt->compressedpages < pcl->compressed_pages + clusterpages) { - if (!cmpxchg(clt->compressedpages++, NULL, page)) + while (clt->icpage_ptr > pcl->compressed_pages) + if (!cmpxchg(--clt->icpage_ptr, NULL, page)) return true; - } return false; } @@ -399,10 +481,10 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct erofs_workgroup *grp; int err; - /* no available workgroup, let's allocate one */ - pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); - if (!pcl) - return -ENOMEM; + /* no available pcluster, let's allocate one */ + pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); + if (IS_ERR(pcl)) + return PTR_ERR(pcl); atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; @@ -416,25 +498,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, else pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0]; - pcl->clusterbits -= PAGE_SHIFT; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; cl = z_erofs_primarycollection(pcl); - - /* must be cleaned before freeing to slab */ - DBG_BUGON(cl->nr_pages); - DBG_BUGON(cl->vcnt); - cl->pageofs = map->m_la & ~PAGE_MASK; /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. 
*/ + mutex_init(&cl->lock); DBG_BUGON(!mutex_trylock(&cl->lock)); grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); @@ -458,7 +533,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, err_out: mutex_unlock(&cl->lock); - kmem_cache_free(pcluster_cachep, pcl); + z_erofs_free_pcluster(pcl); return err; } @@ -502,9 +577,8 @@ out: z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, clt->cl->pagevec, clt->cl->vcnt); - clt->compressedpages = clt->pcl->compressed_pages; - if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ - clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + /* since file-backed online pages are traversed in reverse order */ + clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages; return 0; } @@ -517,9 +591,8 @@ static void z_erofs_rcu_callback(struct rcu_head *head) struct z_erofs_collection *const cl = container_of(head, struct z_erofs_collection, rcu); - kmem_cache_free(pcluster_cachep, - container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster, + primary_collection)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) @@ -706,9 +779,12 @@ err_out: goto out; } +static void z_erofs_decompressqueue_work(struct work_struct *work); static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, bool sync, int bios) { + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + /* wake up the caller thread for sync decompression */ if (sync) { unsigned long flags; @@ -720,8 +796,15 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, return; } - if (!atomic_add_return(bios, &io->pending_bios)) + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use workqueue and sync decompression for atomic contexts only */ + if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); + sbi->ctx.readahead_sync_decompress = true; + return; + } + z_erofs_decompressqueue_work(&io->u.work); } static bool z_erofs_page_is_invalidated(struct page *page) @@ -761,9 +844,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, struct list_head *pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - const unsigned int clusterpages = BIT(pcl->clusterbits); struct z_erofs_pagevec_ctor ctor; - unsigned int i, outputsize, llen, nr_pages; + unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; struct page **pages, **compressed_pages, *page; @@ -843,7 +925,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, overlapped = false; compressed_pages = pcl->compressed_pages; - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { unsigned int pagenr; page = compressed_pages[i]; @@ -896,12 +978,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, partial = true; } + inputsize = pcl->pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { .sb = sb, .in = compressed_pages, .out = pages, .pageofs_out = cl->pageofs, - .inputsize = PAGE_SIZE, + .inputsize = inputsize, .outputsize = outputsize, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -909,8 +992,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, }, pagepool); out: - /* must handle all compressed pages before endding pages */ - for (i = 0; i < clusterpages; ++i) { + /* must handle all compressed pages before ending pages */ + for (i = 0; i < pcl->pclusterpages; 
++i) { page = compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) @@ -1213,7 +1296,7 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); cur = pcl->obj.index; - end = cur + BIT(pcl->clusterbits); + end = cur + pcl->pclusterpages; /* close the main owned chain at first */ owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, @@ -1333,7 +1416,8 @@ static void z_erofs_readahead(struct readahead_control *rac) struct erofs_sb_info *const sbi = EROFS_I_SB(inode); unsigned int nr_pages = readahead_count(rac); - bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); + bool sync = (sbi->ctx.readahead_sync_decompress && + nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index b503b353d4ab..942ee69dff6a 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -10,6 +10,7 @@ #include "internal.h" #include "zpvec.h" +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_NR_INLINE_PAGEVECS 3 /* @@ -59,16 +60,17 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: compressed pages (including multi-usage pages) */ - struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; - /* A: lower limit of decompressed length and if full length or not */ unsigned int length; + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + /* I: compression algorithm format */ unsigned char algorithmformat; - /* I: bit shift of physical cluster size */ - unsigned char clusterbits; + + /* A: compressed pages (can be cached or inplaced pages) */ + struct page *compressed_pages[]; }; #define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) @@ -82,8 +84,6 @@ struct z_erofs_pcluster { #define Z_EROFS_PCLUSTER_NIL (NULL) -#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) - struct z_erofs_decompressqueue { struct super_block *sb; atomic_t pending_bios; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 14d2de35110c..e62d813756f2 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -11,17 +11,16 @@ int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { + if (!erofs_sb_has_big_pcluster(sbi) && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; vi->z_algorithmtype[1] = 0; vi->z_logical_clusterbits = LOG_BLOCK_SIZE; - vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; - vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); } - inode->i_mapping->a_ops = &z_erofs_aops; return 0; } @@ -52,7 +51,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; - DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); + DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); @@ -77,18 +77,22 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); - vi->z_physical_clusterbits[0] = 
vi->z_logical_clusterbits + - ((h->h_clusterbits >> 3) & 3); - - if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { - erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel", - vi->z_physical_clusterbits[0], vi->nid); - err = -EOPNOTSUPP; + if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_done; + } + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", + vi->nid); + err = -EFSCORRUPTED; goto unmap_done; } - - vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + - ((h->h_clusterbits >> 5) & 7); /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); @@ -111,7 +115,7 @@ struct z_erofs_maprecorder { u8 type; u16 clusterofs; u16 delta[2]; - erofs_blk_t pblk; + erofs_blk_t pblk, compressedlcs; }; static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, @@ -174,6 +178,15 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedlcs = m->delta[0] & + ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->delta[0] = 1; + } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: @@ -210,6 +223,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int vcnt, base, lo, encodebits, nblk; int i; u8 *in, type; + bool big_pcluster; if (1 << amortizedshift == 4) vcnt = 2; @@ -218,6 +232,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; base = round_down(eofs, vcnt << amortizedshift); in = m->kaddr + base; @@ -229,7 +244,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; - if (i + 1 != vcnt) { + if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (!big_pcluster) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->delta[0] = 1; + return 0; + } else if (i + 1 != (int)vcnt) { m->delta[0] = lo; return 0; } @@ -242,22 +265,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, in, encodebits * (i - 1), &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) lo = 0; + else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) + lo = 1; m->delta[0] = lo + 1; return 0; } m->clusterofs = lo; m->delta[0] = 0; /* figout out blkaddr (pblk) for HEAD lclusters */ - nblk = 1; - while (i > 0) { - --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) - i -= lo; - - if (i >= 0) + if (!big_pcluster) { + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + i -= 
lo; + + if (i >= 0) + ++nblk; + } + } else { + nblk = 0; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + --i; + nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + continue; + } + /* bigpcluster shouldn't have plain d0 == 1 */ + if (lo <= 1) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + i -= lo - 2; + continue; + } ++nblk; + } } in += (vcnt << amortizedshift) - sizeof(__le32); m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; @@ -381,6 +430,58 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, return 0; } +static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, + unsigned int initial_lcn) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned long lcn; + int err; + + DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); + if (!(map->m_flags & EROFS_MAP_ZIPPED) || + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + map->m_plen = 1 << lclusterbits; + return 0; + } + + lcn = m->lcn + 1; + if (m->compressedlcs) + goto out; + if (lcn == initial_lcn) + goto err_bonus_cblkcnt; + + err = z_erofs_load_cluster_from_disk(m, lcn); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (m->delta[0] != 1) + goto err_bonus_cblkcnt; + if (m->compressedlcs) + break; + fallthrough; + default: + erofs_err(m->inode->i_sb, + "cannot find CBLKCNT @ lcn %lu of nid %llu", + lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } +out: + map->m_plen = m->compressedlcs << lclusterbits; + return 0; +err_bonus_cblkcnt: + erofs_err(m->inode->i_sb, + "bogus CBLKCNT @ lcn %lu of nid %llu", + lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) @@ -392,6 +493,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, }; int err = 0; unsigned int lclusterbits, endoff; + unsigned long initial_lcn; unsigned long long ofs, end; trace_z_erofs_map_blocks_iter_enter(inode, map, flags); @@ -410,10 +512,10 @@ int z_erofs_map_blocks_iter(struct inode *inode, lclusterbits = vi->z_logical_clusterbits; ofs = map->m_la; - m.lcn = ofs >> lclusterbits; + initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, m.lcn); + err = z_erofs_load_cluster_from_disk(&m, initial_lcn); if (err) goto unmap_out; @@ -443,7 +545,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, m.delta[0] = 1; fallthrough; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - /* get the correspoinding first chunk */ + /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; @@ -457,10 +559,12 @@ int z_erofs_map_blocks_iter(struct inode *inode, } map->m_llen = end - map->m_la; - map->m_plen = 1 << lclusterbits; map->m_pa = blknr_to_addr(m.pblk); map->m_flags |= EROFS_MAP_MAPPED; + err = z_erofs_get_extent_compressedlen(&m, initial_lcn); + if (err) + goto out; unmap_out: if (m.kaddr) kunmap_atomic(m.kaddr); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3196474cbe24..73138ea68342 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -366,8 +366,8 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) * * @ep: Pointer to the eventpoll context.
* - * Returns: Returns a value different than zero if ready events are available, - * or zero otherwise. + * Return: a value different than %zero if ready events are available, + * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { @@ -1023,7 +1023,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, } #endif /* CONFIG_KCMP */ -/** +/* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * @@ -1035,10 +1035,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, * completed. * * Also an element can be locklessly added to the list only in one - * direction i.e. either to the tail either to the head, otherwise + * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * - * Returns %false if element has been already added to the list, %true + * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, @@ -1076,11 +1076,11 @@ static inline bool list_add_tail_lockless(struct list_head *new, return true; } -/** +/* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * - * Returns %false if epi element has been already chained, %true otherwise. + * Return: %false if epi element has been already chained, %true otherwise. */ static inline bool chain_epi_lockless(struct epitem *epi) { @@ -1105,8 +1105,8 @@ static inline bool chain_epi_lockless(struct epitem *epi) * mechanism. It is called by the stored file descriptors when they * have events to report. * - * This callback takes a read lock in order not to content with concurrent - * events from another file descriptors, thus all modifications to ->rdllist + * This callback takes a read lock in order not to contend with concurrent + * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_scan_ready_list(), which stops all list modifications and guarantees * that lists state is seen correctly. @@ -1335,8 +1335,8 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth) * paths such that we will spend all our time waking up * eventpoll objects. * - * Returns: Returns zero if the proposed links don't create too many paths, - * -1 otherwise. + * Return: %zero if the proposed links don't create too many paths, + * %-1 otherwise. */ static int reverse_path_check(void) { @@ -1734,7 +1734,7 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) } /** - * ep_poll - Retrieves ready events, and delivers them to the caller supplied + * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. @@ -1747,7 +1747,7 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) * until at least one event has been retrieved (or an error * occurred). * - * Returns: Returns the number of ready events which have been fetched, or an + * Return: the number of ready events which have been fetched, or an * error code, in case of error. 
*/ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, @@ -1774,9 +1774,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, /* * This call is racy: We may or may not see events that are being added - * to the ready list under the lock (e.g., in IRQ callbacks). For, cases + * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under - * lock and will added to the wait queue. For, cases with a zero + * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ @@ -1869,15 +1869,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, /** * ep_loop_check_proc - verify that adding an epoll file inside another - * epoll structure, does not violate the constraints, in + * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * - * @priv: Pointer to the epoll file to be currently checked. + * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Returns: Returns zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or -1 otherwise. + * Return: %zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { @@ -1919,14 +1919,14 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) - * into another epoll file (represented by @from) does not create + * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * - * @from: Pointer to the epoll we are inserting into. + * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * - * Returns: Returns zero if adding the epoll @to inside the epoll @from - * does not violate the constraints, or -1 otherwise. + * Return: %zero if adding the epoll @to inside the epoll @from + * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { @@ -2074,8 +2074,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, ep = f.file->private_data; /* - * When we insert an epoll file descriptor, inside another epoll file - * descriptor, there is the change of creating closed loops, which are + * When we insert an epoll file descriptor inside another epoll file + * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we @@ -2113,7 +2113,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, } /* - * Try to lookup the file inside our RB tree, Since we grabbed "mtx" + * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. 
*/ diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 4c2a9fe30067..4493ef0c715e 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -354,8 +354,8 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, /* Compare two fsmap items. */ static int ext4_getfsmap_compare(void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct ext4_fsmap *fa; struct ext4_fsmap *fb; diff --git a/fs/file.c b/fs/file.c index f3a4bac2cbe9..f633348029a5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,17 +629,30 @@ int close_fd(unsigned fd) } EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +/** + * last_fd - return last valid index into fd table + * @cur_fds: files struct + * + * Context: Either rcu read lock or files_lock must be held. + * + * Returns: Last valid index into fdtable. + */ +static inline unsigned last_fd(struct fdtable *fdt) +{ + return fdt->max_fds - 1; +} + static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; - if (fd > max_fd) - return; - + /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); - bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); + max_fd = min(last_fd(fdt), max_fd); + if (fd <= max_fd) + bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 44f7bfd825cf..1b6c001a7dd1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret == -ENOMEM) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || - (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + inode_wrong_type(inode, outarg.attr.mode)) goto invalid; forget_all_cached_acls(inode); @@ -1054,7 +1054,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || - (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; } else { @@ -1703,7 +1703,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (fuse_invalid_attr(&outarg.attr) || - (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; goto error; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b0e18b470e91..b4b956da3851 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -350,7 +350,7 @@ retry: inode->i_generation = generation; fuse_init_inode(inode, attr); unlock_new_inode(inode); - } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { + } else if (inode_wrong_type(inode, attr->mode)) { /* Inode has changed type, any I/O on the old should fail */ fuse_make_bad(inode); iput(inode); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 3441ffa740f3..277f7041d55a 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -202,7 +202,7 @@ retry: inode = d_inode(dentry); if (!inode || get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + inode_wrong_type(inode, o->attr.mode)) { d_invalidate(dentry); dput(dentry); goto retry; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9567520d79f7..c06a6cdf05de 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1732,7 +1732,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) spin_unlock(&gl->gl_lockref.lock); } -static int glock_cmp(void *priv, 
struct list_head *a, struct list_head *b) +static int glock_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct gfs2_glock *gla, *glb; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 8e32d569c8bf..ef0b583c3417 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -394,18 +394,24 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) const struct gfs2_dinode *str = buf; struct timespec64 atime; u16 height, depth; + umode_t mode = be32_to_cpu(str->di_mode); + bool is_new = ip->i_inode.i_flags & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) goto corrupt; + if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode))) + goto corrupt; ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); - ip->i_inode.i_mode = be32_to_cpu(str->di_mode); - ip->i_inode.i_rdev = 0; - switch (ip->i_inode.i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), - be32_to_cpu(str->di_minor)); - break; + ip->i_inode.i_mode = mode; + if (is_new) { + ip->i_inode.i_rdev = 0; + switch (mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + } } i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6410281546f9..88649b43fcff 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -695,7 +695,7 @@ void log_flush_wait(struct gfs2_sbd *sdp) } } -static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ip_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct gfs2_inode *ipa, *ipb; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a82f4747aa8d..b4809967efc6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -634,7 +634,8 @@ static void gfs2_check_magic(struct buffer_head *bh) kunmap_atomic(kaddr); } -static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b) +static int blocknr_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct gfs2_bufdata *bda, *bdb; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 29e407762626..7b5e984ff02a 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -144,7 +144,7 @@ static char *follow_link(char *link) char *name, *resolved, *end; int n; - name = __getname(); + name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { n = -ENOMEM; goto out_free; @@ -173,12 +173,11 @@ static char *follow_link(char *link) goto out_free; } - __putname(name); - kfree(link); + kfree(name); return resolved; out_free: - __putname(name); + kfree(name); return ERR_PTR(n); } @@ -712,7 +711,6 @@ static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (name == NULL) goto out_put; - init_special_inode(inode, mode, dev); err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); if (err) goto out_free; diff --git a/fs/io-wq.c b/fs/io-wq.c index 433c4d3c3c1c..4eba531bea5a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -415,6 +415,7 @@ static void io_worker_handle_work(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; + bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { struct io_wq_work *work; @@ -444,6 +445,9 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); + + if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) + work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); io_assign_current_work(worker, NULL); diff --git 
a/fs/io_uring.c b/fs/io_uring.c index 65a17d560a73..dff34975d86b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2762,6 +2762,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; + bool check_reissue = kiocb->ki_complete == io_complete_rw; /* add previously done IO, if any */ if (io && io->bytes_done > 0) { @@ -2777,6 +2778,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); + + if (check_reissue && req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + if (!io_rw_reissue(req)) { + int cflags = 0; + + req_set_fail_links(req); + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_rw_kbuf(req); + __io_req_complete(req, issue_flags, ret, cflags); + } + } } static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) @@ -3294,6 +3307,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { + req->flags &= ~REQ_F_REISSUE; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) goto done; @@ -3417,8 +3431,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; ret2 = -EAGAIN; + } /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just @@ -6173,7 +6189,6 @@ static void io_wq_submit_work(struct io_wq_work *work) ret = -ECANCELED; if (!ret) { - req->flags &= ~REQ_F_REISSUE; do { ret = io_issue_sqe(req, 0); /* @@ -6739,6 +6754,9 @@ static int io_sq_thread(void *data) current->flags |= PF_NO_SETAFFINITY; mutex_lock(&sqd->lock); + /* a user may had exited before the thread started */ + io_run_task_work_head(&sqd->park_task_work); + while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { int ret; bool cap_entries, sqt_spin, needs_sched; @@ -6755,10 +6773,10 @@ static int io_sq_thread(void *data) } cond_resched(); mutex_lock(&sqd->lock); - if (did_sig) - break; io_run_task_work(); io_run_task_work_head(&sqd->park_task_work); + if (did_sig) + break; timeout = jiffies + sqd->sq_thread_idle; continue; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 414769a6ad11..0129e6bab985 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1155,7 +1155,8 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); static int -iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b) +iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) { struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); diff --git a/fs/locks.c b/fs/locks.c index 6125d2de39b8..5c42363aa811 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2369,7 +2369,6 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) if (flock->l_pid != 0) goto out; - cmd = F_GETLK; fl->fl_flags |= FL_OFDLCK; fl->fl_owner = filp; } @@ -2825,7 +2824,7 @@ struct locks_iterator { }; static void lock_get_status(struct seq_file *f, struct file_lock *fl, - loff_t id, char *pfx) + loff_t id, char *pfx, int repeat) { struct inode *inode = NULL; unsigned int fl_pid; @@ -2841,7 
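The kiocb_done() hunk above moves -EAGAIN reissue handling into the completion path: REQ_F_REISSUE is consumed (cleared) exactly once, and only when resubmission fails is the request completed with the error. A schematic of that consume-then-retry rule, using helpers that appear in this diff; the buffer-select cflags handling is elided:

static void complete_or_reissue(struct io_kiocb *req, long ret,
				unsigned int issue_flags)
{
	if (!(req->flags & REQ_F_REISSUE))
		return;
	req->flags &= ~REQ_F_REISSUE;	/* consume before retrying */
	if (!io_rw_reissue(req)) {
		req_set_fail_links(req);
		__io_req_complete(req, issue_flags, ret, 0);
	}
}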
+2840,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	if (fl->fl_file != NULL)
 		inode = locks_inode(fl->fl_file);
 
-	seq_printf(f, "%lld:%s ", id, pfx);
+	seq_printf(f, "%lld: ", id);
+
+	if (repeat)
+		seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);
+
 	if (IS_POSIX(fl)) {
 		if (fl->fl_flags & FL_ACCESS)
 			seq_puts(f, "ACCESS");
@@ -2903,21 +2906,64 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	}
 }
 
+static struct file_lock *get_next_blocked_member(struct file_lock *node)
+{
+	struct file_lock *tmp;
+
+	/* NULL node or root node */
+	if (node == NULL || node->fl_blocker == NULL)
+		return NULL;
+
+	/* Next member in the linked list could be itself */
+	tmp = list_next_entry(node, fl_blocked_member);
+	if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member)
+		|| tmp == node) {
+		return NULL;
+	}
+
+	return tmp;
+}
+
 static int locks_show(struct seq_file *f, void *v)
 {
 	struct locks_iterator *iter = f->private;
-	struct file_lock *fl, *bfl;
+	struct file_lock *cur, *tmp;
 	struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+	int level = 0;
 
-	fl = hlist_entry(v, struct file_lock, fl_link);
+	cur = hlist_entry(v, struct file_lock, fl_link);
 
-	if (locks_translate_pid(fl, proc_pidns) == 0)
+	if (locks_translate_pid(cur, proc_pidns) == 0)
 		return 0;
 
-	lock_get_status(f, fl, iter->li_pos, "");
+	/* View this cross-linked list as a binary tree: the first member of
+	 * fl_blocked_requests is the left child of the current node, the next
+	 * sibling in fl_blocked_member is the right child, and the parent of
+	 * the current node is reachable through fl_blocker, so this reduces
+	 * to the traversal of a binary tree.
+	 */
+	while (cur != NULL) {
+		if (level)
+			lock_get_status(f, cur, iter->li_pos, "-> ", level);
+		else
+			lock_get_status(f, cur, iter->li_pos, "", level);
 
-	list_for_each_entry(bfl, &fl->fl_blocked_requests, fl_blocked_member)
-		lock_get_status(f, bfl, iter->li_pos, " ->");
+		if (!list_empty(&cur->fl_blocked_requests)) {
+			/* Turn left */
+			cur = list_first_entry_or_null(&cur->fl_blocked_requests,
+						       struct file_lock, fl_blocked_member);
+			level++;
+		} else {
+			/* Turn right */
+			tmp = get_next_blocked_member(cur);
+			/* Fall back to parent node */
+			while (tmp == NULL && cur->fl_blocker != NULL) {
+				cur = cur->fl_blocker;
+				level--;
+				tmp = get_next_blocked_member(cur);
+			}
+			cur = tmp;
+		}
+	}
 
 	return 0;
 }
@@ -2938,7 +2984,7 @@ static void __show_fd_locks(struct seq_file *f,
 
 		(*id)++;
 		seq_puts(f, "lock:\t");
-		lock_get_status(f, fl, *id, "");
+		lock_get_status(f, fl, *id, "", 0);
 	}
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index 216f16e74351..48a2f288e802 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 	p->stack = p->internal;
 	p->dfd = dfd;
 	p->name = name;
+	p->path.mnt = NULL;
+	p->path.dentry = NULL;
 	p->total_link_count = old ? old->total_link_count : 0;
 	p->saved = old;
 	current->nameidata = p;
@@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd)
 		rcu_read_unlock();
 	}
 	nd->depth = 0;
+	nd->path.mnt = NULL;
+	nd->path.dentry = NULL;
 }
 
 /* path_put is needed afterwards regardless of success or failure */
@@ -2322,8 +2326,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	}
 
 	nd->root.mnt = NULL;
-	nd->path.mnt = NULL;
-	nd->path.dentry = NULL;
 
 	/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd).
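The locks_show() rewrite above prints the whole tree of blocked lock requests with neither recursion nor an explicit stack: fl_blocked_requests acts as a first-child pointer, fl_blocked_member as a next-sibling link, and fl_blocker as the parent pointer. The same traversal on a generic tree, as a self-contained sketch:

struct node {
	struct node *first_child;	/* plays the role of fl_blocked_requests */
	struct node *next_sibling;	/* plays the role of fl_blocked_member */
	struct node *parent;		/* plays the role of fl_blocker */
};

static void visit_all(struct node *root, void (*visit)(struct node *, int))
{
	struct node *cur = root;
	int level = 0;

	while (cur) {
		visit(cur, level);
		if (cur->first_child) {			/* turn left */
			cur = cur->first_child;
			level++;
		} else {
			/* climb until a right turn is possible */
			while (cur != root && !cur->next_sibling) {
				cur = cur->parent;
				level--;
			}
			if (cur == root)
				break;
			cur = cur->next_sibling;	/* turn right */
		}
	}
}

The level counter is what lets lock_get_status() indent each blocked request under its blocker.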
*/ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { @@ -2419,16 +2421,16 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { - err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... - } if (!err) { *path = nd->path; nd->path.mnt = NULL; diff --git a/fs/namespace.c b/fs/namespace.c index 56bb5a5fdc0d..f63337828e1c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1242,8 +1242,9 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -/* path_is_mountpoint() - Check if path is a mount in the current - * namespace. +/** + * path_is_mountpoint() - Check if path is a mount in the current namespace. + * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. @@ -1369,7 +1370,7 @@ void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) /** * may_umount_tree - check if a mount tree is busy - * @mnt: root of mount tree + * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are @@ -1939,10 +1940,11 @@ void drop_collected_mounts(struct vfsmount *mnt) /** * clone_private_mount - create a private clone of a path + * @path: path to clone * - * This creates a new vfsmount, which will be the clone of @path. The new will - * not be attached anywhere in the namespace and will be private (i.e. changes - * to the originating mount won't be propagated into this). + * This creates a new vfsmount, which will be the clone of @path. The new mount + * will not be attached anywhere in the namespace and will be private (i.e. + * changes to the originating mount won't be propagated into this). * * Release with mntput(). */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a7fb076a5f44..5a8854de0c19 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -335,7 +335,7 @@ nfs_find_actor(struct inode *inode, void *opaque) if (NFS_FILEID(inode) != fattr->fileid) return 0; - if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode)) + if (inode_wrong_type(inode, fattr->mode)) return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; @@ -1461,7 +1461,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; return -ESTALE; } - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) return -ESTALE; @@ -1876,7 +1876,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Make sure the inode's type hasn't changed. */ - if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) { /* * Big trouble! The inode has become a different object. 
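The NFS hunks above, like the 9p, fuse and gfs2 ones earlier, replace open-coded S_IFMT comparisons with one new helper. Its definition is not part of this diff; our reading of the new include/linux/fs.h helper, stated as an assumption:

/* True if the cached inode's file type no longer matches the
 * mode just fetched from the server or from disk. */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
	return (inode->i_mode ^ mode) & S_IFMT;
}

The XOR-and-mask form is equivalent to comparing the two S_IFMT-masked values, which is exactly what the removed lines did.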
*/ diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 79c563c1a5e8..5a5bd85d08f8 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -136,6 +136,77 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, } EXPORT_SYMBOL_GPL(nfsacl_encode); +/** + * nfs_stream_encode_acl - Encode an NFSv3 ACL + * + * @xdr: an xdr_stream positioned to receive an encoded ACL + * @inode: inode of file whose ACL this is + * @acl: posix_acl to encode + * @encode_entries: whether to encode ACEs as well + * @typeflag: ACL type: NFS_ACL_DEFAULT or zero + * + * Return values: + * %false: The ACL could not be encoded + * %true: @xdr is advanced to the next available position + */ +bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode, + struct posix_acl *acl, int encode_entries, + int typeflag) +{ + const size_t elem_size = XDR_UNIT * 3; + u32 entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; + struct nfsacl_encode_desc nfsacl_desc = { + .desc = { + .elem_size = elem_size, + .array_len = encode_entries ? entries : 0, + .xcode = xdr_nfsace_encode, + }, + .acl = acl, + .typeflag = typeflag, + .uid = inode->i_uid, + .gid = inode->i_gid, + }; + struct nfsacl_simple_acl aclbuf; + unsigned int base; + int err; + + if (entries > NFS_ACL_MAX_ENTRIES) + return false; + if (xdr_stream_encode_u32(xdr, entries) < 0) + return false; + + if (encode_entries && acl && acl->a_count == 3) { + struct posix_acl *acl2 = &aclbuf.acl; + + /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is + * invoked in contexts where a memory allocation failure is + * fatal. Fortunately this fake ACL is small enough to + * construct on the stack. */ + posix_acl_init(acl2, 4); + + /* Insert entries in canonical order: other orders seem + to confuse Solaris VxFS. */ + acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */ + acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */ + acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK */ + acl2->a_entries[2].e_tag = ACL_MASK; + acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */ + nfsacl_desc.acl = acl2; + } + + base = xdr_stream_pos(xdr); + if (!xdr_reserve_space(xdr, XDR_UNIT + + elem_size * nfsacl_desc.desc.array_len)) + return false; + err = xdr_encode_array2(xdr->buf, base, &nfsacl_desc.desc); + if (err) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(nfs_stream_encode_acl); + + struct nfsacl_decode_desc { struct xdr_array2_desc desc; unsigned int count; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index d6cff5fbe705..5fa38ad9e7e3 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -99,7 +99,7 @@ config NFSD_BLOCKLAYOUT help This option enables support for the exporting pNFS block layouts in the kernel's NFS server. The pNFS block layout enables NFS - clients to directly perform I/O to block devices accesible to both + clients to directly perform I/O to block devices accessible to both the server and the clients. See RFC 5663 for more details. If unsure, say N. @@ -113,7 +113,7 @@ config NFSD_SCSILAYOUT help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS - clients to directly perform I/O to SCSI devices accesible to both + clients to directly perform I/O to SCSI devices accessible to both the server and the clients. See draft-ietf-nfsv4-scsi-layout for more details. @@ -127,7 +127,7 @@ config NFSD_FLEXFILELAYOUT This option enables support for the exporting pNFS Flex File layouts in the kernel's NFS server. 
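nfs_stream_encode_acl(), added above, follows the usual xdr_stream discipline: reserve every byte an item needs first, fail the encode as a unit when the send buffer is exhausted, and only then store words. The skeleton of that pattern, with an illustrative helper name:

static bool encode_u32_pair(struct xdr_stream *xdr, u32 a, u32 b)
{
	__be32 *p = xdr_reserve_space(xdr, 2 * XDR_UNIT);

	if (!p)
		return false;	/* nothing half-written */
	*p++ = cpu_to_be32(a);
	*p = cpu_to_be32(b);
	return true;
}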
The pNFS Flex File layout enables NFS clients to directly perform I/O to NFSv3 devices - accesible to both the server and the clients. See + accessible to both the server and the clients. See draft-ietf-nfsv4-flex-files for more details. Warning, this server implements the bare minimum functionality diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index c330f5bd0cf3..a75abeb1e698 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -51,9 +51,6 @@ struct nfsd_net { bool grace_ended; time64_t boot_time; - /* internal mount of the "nfsd" pseudofilesystem: */ - struct vfsmount *nfsd_mnt; - struct dentry *nfsd_client_dir; /* @@ -130,6 +127,9 @@ struct nfsd_net { wait_queue_head_t ntf_wq; atomic_t ntf_refcnt; + /* Allow umount to wait for nfsd state cleanup */ + struct completion nfsd_shutdown_complete; + /* * clientid and stateid data for construction of net unique COPY * stateids. diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 855e17772eba..4b43929c1f25 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -242,79 +242,61 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) /* GETACL */ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct inode *inode; - struct kvec *head = rqstp->rq_res.head; - unsigned int base; - int n; int w; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; - /* - * Since this is version 2, the check for nfserr in - * nfsd_dispatch actually ensures the following cannot happen. - * However, it seems fragile to depend on that. - */ if (dentry == NULL || d_really_is_negative(dentry)) - return 0; + return 1; inode = d_inode(dentry); - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->mask); - if (!xdr_ressize_check(rqstp, p)) + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return 0; - base = (char *)p - (char *)head->iov_base; rqstp->rq_res.page_len = w = nfsacl_size( (resp->mask & NFS_ACL) ? resp->acl_access : NULL, (resp->mask & NFS_DFACL) ? 
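The netns.h hunk above drops the internal nfsd mount pointer and adds a completion so that unmount can wait for nfsd's per-net state to be torn down. The usual shape of that handshake, with illustrative names; nfsd's actual field lives in struct nfsd_net and would be set up with init_completion():

struct shutdown_sync {
	struct completion done;
};

/* Called by the last server thread once cleanup is finished. */
static void last_thread_exit(struct shutdown_sync *s)
{
	complete(&s->done);
}

/* Called on the umount path before freeing shared state. */
static void umount_wait(struct shutdown_sync *s)
{
	wait_for_completion(&s->done);
}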
resp->acl_default : NULL); while (w > 0) { if (!*(rqstp->rq_next_page++)) - return 0; + return 1; w -= PAGE_SIZE; } - n = nfsacl_encode(&rqstp->rq_res, base, inode, - resp->acl_access, - resp->mask & NFS_ACL, 0); - if (n > 0) - n = nfsacl_encode(&rqstp->rq_res, base + n, inode, - resp->acl_default, - resp->mask & NFS_DFACL, - NFS_ACL_DEFAULT); - return (n > 0); -} - -static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_attrstat *resp = rqstp->rq_resp; - - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, + resp->mask & NFS_ACL, 0)) + return 0; + if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, + resp->mask & NFS_DFACL, NFS_ACL_DEFAULT)) + return 0; - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); -out: - return xdr_ressize_check(rqstp, p); + return 1; } /* ACCESS */ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_accessres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->access) < 0) + return 0; + break; + } - p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->access); -out: - return xdr_ressize_check(rqstp, p); + return 1; } /* @@ -329,13 +311,6 @@ static void nfsaclsvc_release_getacl(struct svc_rqst *rqstp) posix_acl_release(resp->acl_default); } -static void nfsaclsvc_release_attrstat(struct svc_rqst *rqstp) -{ - struct nfsd_attrstat *resp = rqstp->rq_resp; - - fh_put(&resp->fh); -} - static void nfsaclsvc_release_access(struct svc_rqst *rqstp) { struct nfsd3_accessres *resp = rqstp->rq_resp; @@ -375,8 +350,8 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { [ACLPROC2_SETACL] = { .pc_func = nfsacld_proc_setacl, .pc_decode = nfsaclsvc_decode_setaclargs, - .pc_encode = nfsaclsvc_encode_attrstatres, - .pc_release = nfsaclsvc_release_attrstat, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd3_setaclargs), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, @@ -386,8 +361,8 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { [ACLPROC2_GETATTR] = { .pc_func = nfsacld_proc_getattr, .pc_decode = nfssvc_decode_fhandleargs, - .pc_encode = nfsaclsvc_encode_attrstatres, - .pc_release = nfsaclsvc_release_attrstat, + .pc_encode = nfssvc_encode_attrstatres, + .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9a6f18d74d14..a1591feeea22 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -168,22 +168,25 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) /* GETACL */ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; + struct kvec *head = rqstp->rq_res.head; + struct inode *inode = d_inode(dentry); + unsigned int base; + int n; + int w; - *p++ = resp->status; - p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0 && dentry 
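The converted ACL encoders above all share one shape: encode the status word, switch on the status to decide which body fields follow, and reserve a return value of 0 strictly for XDR buffer exhaustion (the old xdr_ressize_check() bookkeeping disappears). Schematically, not a real procedure:

static int encode_result(struct svc_rqst *rqstp, __be32 status)
{
	struct xdr_stream *xdr = &rqstp->rq_res_stream;

	if (!svcxdr_encode_stat(xdr, status))
		return 0;		/* send buffer exhausted */
	switch (status) {
	case nfs_ok:
		/* encode the success arm: fattr, mask, ... */
		break;
	default:
		/* most NFSv2 failure arms carry no body */
		break;
	}
	return 1;
}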
&& d_really_is_positive(dentry)) { - struct inode *inode = d_inode(dentry); - struct kvec *head = rqstp->rq_res.head; - unsigned int base; - int n; - int w; - - *p++ = htonl(resp->mask); - if (!xdr_ressize_check(rqstp, p)) + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->mask) < 0) return 0; - base = (char *)p - (char *)head->iov_base; + + base = (char *)xdr->p - (char *)head->iov_base; rqstp->rq_res.page_len = w = nfsacl_size( (resp->mask & NFS_ACL) ? resp->acl_access : NULL, @@ -204,9 +207,11 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) NFS_ACL_DEFAULT); if (n <= 0) return 0; - } else - if (!xdr_ressize_check(rqstp, p)) + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; + } return 1; } @@ -214,11 +219,11 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) /* SETACL */ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh); } /* diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 8675851199f8..17715a6c7a40 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -126,14 +126,15 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_readlinkres *resp = rqstp->rq_resp; - char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ fh_copy(&resp->fh, &argp->fh); resp->len = NFS3_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &resp->fh, buffer, &resp->len); + resp->pages = rqstp->rq_next_page++; + resp->status = nfsd_readlink(rqstp, &resp->fh, + page_address(*resp->pages), &resp->len); return rpc_success; } @@ -158,6 +159,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) v = 0; len = argp->count; + resp->pages = rqstp->rq_next_page; while (len > 0) { struct page *page = *(rqstp->rq_next_page++); @@ -439,17 +441,30 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, struct nfsd3_readdirres *resp, int count) { + struct xdr_buf *buf = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + count = min_t(u32, count, svc_max_payload(rqstp)); - /* Convert byte count to number of words (i.e. >> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->buflen = (count >> 2) - 2; + memset(buf, 0, sizeof(*buf)); - resp->buffer = page_address(*rqstp->rq_next_page); + /* Reserve room for the NULL ptr & eof flag (-2 words) */ + buf->buflen = count - XDR_UNIT * 2; + buf->pages = rqstp->rq_next_page; while (count > 0) { rqstp->rq_next_page++; count -= PAGE_SIZE; } + + /* This is xdr_init_encode(), but it assumes that + * the head kvec has already been consumed. 
*/ + xdr_set_scratch_buffer(xdr, NULL, 0); + xdr->buf = buf; + xdr->page_ptr = buf->pages; + xdr->iov = NULL; + xdr->p = page_address(*buf->pages); + xdr->end = xdr->p + (PAGE_SIZE >> 2); + xdr->rqst = NULL; } /* @@ -460,10 +475,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - int count = 0; loff_t offset; - struct page **p; - caddr_t page_addr = NULL; dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -471,39 +483,18 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) nfsd3_init_dirlist_pages(rqstp, resp, argp->count); - /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->common.err = nfs_ok; + resp->cookie_offset = 0; resp->rqstp = rqstp; offset = argp->cookie; - resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, - &resp->common, nfs3svc_encode_entry); + &resp->common, nfs3svc_encode_entry3); memcpy(resp->verf, argp->verf, 8); - count = 0; - for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { - page_addr = page_address(*p); + nfs3svc_encode_cookie3(resp, offset); - if (((caddr_t)resp->buffer >= page_addr) && - ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { - count += (caddr_t)resp->buffer - page_addr; - break; - } - count += PAGE_SIZE; - } - resp->count = count >> 2; - if (resp->offset) { - if (unlikely(resp->offset1)) { - /* we ended up with offset on a page boundary */ - *resp->offset = htonl(offset >> 32); - *resp->offset1 = htonl(offset & 0xffffffff); - resp->offset1 = NULL; - } else { - xdr_encode_hyper(resp->offset, offset); - } - resp->offset = NULL; - } + /* Recycle only pages that were part of the reply */ + rqstp->rq_next_page = resp->xdr.page_ptr + 1; return rpc_success; } @@ -517,10 +508,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) { struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; - int count = 0; loff_t offset; - struct page **p; - caddr_t page_addr = NULL; dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -528,10 +516,9 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) nfsd3_init_dirlist_pages(rqstp, resp, argp->count); - /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->common.err = nfs_ok; + resp->cookie_offset = 0; resp->rqstp = rqstp; offset = argp->cookie; @@ -545,30 +532,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) } resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, - &resp->common, nfs3svc_encode_entry_plus); + &resp->common, nfs3svc_encode_entryplus3); memcpy(resp->verf, argp->verf, 8); - for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) { - page_addr = page_address(*p); + nfs3svc_encode_cookie3(resp, offset); - if (((caddr_t)resp->buffer >= page_addr) && - ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { - count += (caddr_t)resp->buffer - page_addr; - break; - } - count += PAGE_SIZE; - } - resp->count = count >> 2; - if (resp->offset) { - if (unlikely(resp->offset1)) { - /* we ended up with offset on a page boundary */ - *resp->offset = htonl(offset >> 32); - *resp->offset1 = htonl(offset & 0xffffffff); - resp->offset1 = NULL; - } else { - xdr_encode_hyper(resp->offset, offset); - } - resp->offset = NULL; - } + /* Recycle only pages that were part of the reply */ + rqstp->rq_next_page = resp->xdr.page_ptr + 1; out: return rpc_success; @@ -736,7 +705,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_GETATTR] = { .pc_func = 
nfsd3_proc_getattr, .pc_decode = nfs3svc_decode_fhandleargs, - .pc_encode = nfs3svc_encode_attrstatres, + .pc_encode = nfs3svc_encode_getattrres, .pc_release = nfs3svc_release_fhandle, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), @@ -758,7 +727,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_LOOKUP] = { .pc_func = nfsd3_proc_lookup, .pc_decode = nfs3svc_decode_diropargs, - .pc_encode = nfs3svc_encode_diropres, + .pc_encode = nfs3svc_encode_lookupres, .pc_release = nfs3svc_release_fhandle2, .pc_argsize = sizeof(struct nfsd3_diropargs), .pc_ressize = sizeof(struct nfsd3_diropres), diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9d9a01ce0b27..0a5ebc52e6a9 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -14,13 +14,26 @@ #include "netns.h" #include "vfs.h" -#define NFSDDBG_FACILITY NFSDDBG_XDR +/* + * Force construction of an empty post-op attr + */ +static const struct svc_fh nfs3svc_null_fh = { + .fh_no_wcc = true, +}; +/* + * time_delta. {1, 0} means the server is accurate only + * to the nearest second. + */ +static const struct timespec64 nfs3svc_time_delta = { + .tv_sec = 1, + .tv_nsec = 0, +}; /* * Mapping of S_IF* types to NFS file types */ -static u32 nfs3_ftypes[] = { +static const u32 nfs3_ftypes[] = { NF3NON, NF3FIFO, NF3CHR, NF3BAD, NF3DIR, NF3BAD, NF3BLK, NF3BAD, NF3REG, NF3BAD, NF3LNK, NF3BAD, @@ -33,9 +46,11 @@ static u32 nfs3_ftypes[] = { */ static __be32 * -encode_time3(__be32 *p, struct timespec64 *time) +encode_nfstime3(__be32 *p, const struct timespec64 *time) { - *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec); + *p++ = cpu_to_be32((u32)time->tv_sec); + *p++ = cpu_to_be32(time->tv_nsec); + return p; } @@ -82,14 +97,80 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) return true; } -static __be32 * -encode_fh(__be32 *p, struct svc_fh *fhp) +/** + * svcxdr_encode_nfsstat3 - Encode an NFSv3 status code + * @xdr: XDR stream + * @status: status value to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(status)); + if (!p) + return false; + *p = status; + + return true; +} + +static bool +svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp) { - unsigned int size = fhp->fh_handle.fh_size; - *p++ = htonl(size); - if (size) p[XDR_QUADLEN(size)-1]=0; + u32 size = fhp->fh_handle.fh_size; + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT + size); + if (!p) + return false; + *p++ = cpu_to_be32(size); + if (size) + p[XDR_QUADLEN(size) - 1] = 0; memcpy(p, &fhp->fh_handle.fh_base, size); - return p + XDR_QUADLEN(size); + + return true; +} + +static bool +svcxdr_encode_post_op_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp) +{ + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + if (!svcxdr_encode_nfs_fh3(xdr, fhp)) + return false; + + return true; +} + +static bool +svcxdr_encode_cookieverf3(struct xdr_stream *xdr, const __be32 *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_COOKIEVERFSIZE); + if (!p) + return false; + memcpy(p, verf, NFS3_COOKIEVERFSIZE); + + return true; +} + +static bool +svcxdr_encode_writeverf3(struct xdr_stream *xdr, const __be32 *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS3_WRITEVERFSIZE); + if (!p) + return false; + memcpy(p, verf, NFS3_WRITEVERFSIZE); + + return true; } static bool @@ -253,115 +334,157 @@ 
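The nfs3_ftypes table above is indexed with (mode & S_IFMT) >> 12 because the POSIX file type occupies the top nibble of the mode word, so sixteen slots cover every case and the impossible ones hold NF3BAD. Worked out in octal:

/* S_IFIFO = 0010000, 0010000 >> 12 = 1, nfs3_ftypes[1] = NF3FIFO
 * S_IFDIR = 0040000, 0040000 >> 12 = 4, nfs3_ftypes[4] = NF3DIR
 * S_IFREG = 0100000, 0100000 >> 12 = 8, nfs3_ftypes[8] = NF3REG */
static u32 nfs3_type_of(umode_t mode)
{
	return nfs3_ftypes[(mode & S_IFMT) >> 12];
}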
svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr, svcxdr_decode_specdata3(xdr, args); } -static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat) { - u64 f; + struct user_namespace *userns = nfsd_user_namespace(rqstp); + __be32 *p; + u64 fsid; + + p = xdr_reserve_space(xdr, XDR_UNIT * 21); + if (!p) + return false; + + *p++ = cpu_to_be32(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = cpu_to_be32((u32)(stat->mode & S_IALLUGO)); + *p++ = cpu_to_be32((u32)stat->nlink); + *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid)); + *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) + p = xdr_encode_hyper(p, (u64)NFS3_MAXPATHLEN); + else + p = xdr_encode_hyper(p, (u64)stat->size); + + /* used */ + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + + /* rdev */ + *p++ = cpu_to_be32((u32)MAJOR(stat->rdev)); + *p++ = cpu_to_be32((u32)MINOR(stat->rdev)); + switch(fsid_source(fhp)) { - default: - case FSIDSOURCE_DEV: - p = xdr_encode_hyper(p, (u64)huge_encode_dev - (fhp->fh_dentry->d_sb->s_dev)); - break; case FSIDSOURCE_FSID: - p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); + fsid = (u64)fhp->fh_export->ex_fsid; break; case FSIDSOURCE_UUID: - f = ((u64*)fhp->fh_export->ex_uuid)[0]; - f ^= ((u64*)fhp->fh_export->ex_uuid)[1]; - p = xdr_encode_hyper(p, f); + fsid = ((u64 *)fhp->fh_export->ex_uuid)[0]; + fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1]; break; + default: + fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev); } - return p; -} + p = xdr_encode_hyper(p, fsid); -static __be32 * -encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, - struct kstat *stat) -{ - struct user_namespace *userns = nfsd_user_namespace(rqstp); - *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); - *p++ = htonl((u32) (stat->mode & S_IALLUGO)); - *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) from_kuid_munged(userns, stat->uid)); - *p++ = htonl((u32) from_kgid_munged(userns, stat->gid)); - if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { - p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); - } else { - p = xdr_encode_hyper(p, (u64) stat->size); - } - p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); - *p++ = htonl((u32) MAJOR(stat->rdev)); - *p++ = htonl((u32) MINOR(stat->rdev)); - p = encode_fsid(p, fhp); + /* fileid */ p = xdr_encode_hyper(p, stat->ino); - p = encode_time3(p, &stat->atime); - p = encode_time3(p, &stat->mtime); - p = encode_time3(p, &stat->ctime); - return p; + p = encode_nfstime3(p, &stat->atime); + p = encode_nfstime3(p, &stat->mtime); + encode_nfstime3(p, &stat->ctime); + + return true; } -static __be32 * -encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_wcc_attr(struct xdr_stream *xdr, const struct svc_fh *fhp) { - /* Attributes to follow */ - *p++ = xdr_one; - return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr); + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 6); + if (!p) + return false; + p = xdr_encode_hyper(p, (u64)fhp->fh_pre_size); + p = encode_nfstime3(p, &fhp->fh_pre_mtime); + encode_nfstime3(p, &fhp->fh_pre_ctime); + + return true; } -/* - * Encode post-operation attributes. - * The inode may be NULL if the call failed because of a stale file - * handle. In this case, no attributes are returned. 
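svcxdr_encode_fattr3() above reserves XDR_UNIT * 21 in one go rather than checking space per field. The count follows from the fattr3 layout of RFC 1813, where every field is fixed width; in 4-byte XDR words:

#define NFS3_FATTR_WORDS \
	(1 /* type */ + 1 /* mode */ + 1 /* nlink */ +		\
	 1 /* uid */  + 1 /* gid */  +				\
	 2 /* size */ + 2 /* used */ + 2 /* rdev */ +		\
	 2 /* fsid */ + 2 /* fileid */ +			\
	 2 /* atime */ + 2 /* mtime */ + 2 /* ctime */)	/* = 21 */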
- */ -static __be32 * -encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_pre_op_attr(struct xdr_stream *xdr, const struct svc_fh *fhp) { - struct dentry *dentry = fhp->fh_dentry; - if (!fhp->fh_no_wcc && dentry && d_really_is_positive(dentry)) { - __be32 err; - struct kstat stat; - - err = fh_getattr(fhp, &stat); - if (!err) { - *p++ = xdr_one; /* attributes follow */ - lease_get_mtime(d_inode(dentry), &stat.mtime); - return encode_fattr3(rqstp, p, fhp, &stat); - } + if (!fhp->fh_pre_saved) { + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + return true; } - *p++ = xdr_zero; - return p; + + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + return svcxdr_encode_wcc_attr(xdr, fhp); } -/* Helper for NFSv3 ACLs */ -__be32 * -nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +/** + * svcxdr_encode_post_op_attr - Encode NFSv3 post-op attributes + * @rqstp: Context of a completed RPC transaction + * @xdr: XDR stream + * @fhp: File handle to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp) { - return encode_post_op_attr(rqstp, p, fhp); + struct dentry *dentry = fhp->fh_dentry; + struct kstat stat; + + /* + * The inode may be NULL if the call failed because of a + * stale file handle. In this case, no attributes are + * returned. + */ + if (fhp->fh_no_wcc || !dentry || !d_really_is_positive(dentry)) + goto no_post_op_attrs; + if (fh_getattr(fhp, &stat) != nfs_ok) + goto no_post_op_attrs; + + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + lease_get_mtime(d_inode(dentry), &stat.mtime); + if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &stat)) + return false; + + return true; + +no_post_op_attrs: + return xdr_stream_encode_item_absent(xdr) > 0; } /* - * Enocde weak cache consistency data + * Encode weak cache consistency data */ -static __be32 * -encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_wcc_data(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp) { - struct dentry *dentry = fhp->fh_dentry; - - if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { - if (fhp->fh_pre_saved) { - *p++ = xdr_one; - p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); - p = encode_time3(p, &fhp->fh_pre_mtime); - p = encode_time3(p, &fhp->fh_pre_ctime); - } else { - *p++ = xdr_zero; - } - return encode_saved_post_attr(rqstp, p, fhp); - } - /* no pre- or post-attrs */ - *p++ = xdr_zero; - return encode_post_op_attr(rqstp, p, fhp); + struct dentry *dentry = fhp->fh_dentry; + + if (!dentry || !d_really_is_positive(dentry) || !fhp->fh_post_saved) + goto neither; + + /* before */ + if (!svcxdr_encode_pre_op_attr(xdr, fhp)) + return false; + + /* after */ + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &fhp->fh_post_attr)) + return false; + + return true; + +neither: + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, fhp)) + return false; + + return true; } static bool fs_supports_change_attribute(struct super_block *sb) @@ -713,210 +836,252 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) /* GETATTR */ int -nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +nfs3svc_encode_getattrres(struct svc_rqst *rqstp, __be32 *p) { + 
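post_op_attr and pre_op_attr, as rewritten above, are XDR optionals: a one-word boolean discriminant announces whether a body follows. The generic pattern with the xdr_stream helpers used here; the u32 payload is illustrative:

static bool encode_optional_u32(struct xdr_stream *xdr, const u32 *val)
{
	if (!val)
		return xdr_stream_encode_item_absent(xdr) > 0;
	if (xdr_stream_encode_item_present(xdr) < 0)
		return false;
	return xdr_stream_encode_u32(xdr, *val) > 0;
}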
struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status == 0) { - lease_get_mtime(d_inode(resp->fh.fh_dentry), - &resp->stat.mtime); - p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime); + if (!svcxdr_encode_fattr3(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + break; } - return xdr_ressize_check(rqstp, p); + + return 1; } /* SETATTR, REMOVE, RMDIR */ int nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->fh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh); } /* LOOKUP */ -int -nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) +int nfs3svc_encode_lookupres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_diropres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status == 0) { - p = encode_fh(p, &resp->fh); - p = encode_post_op_attr(rqstp, p, &resp->fh); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_nfs_fh3(xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh)) + return 0; } - p = encode_post_op_attr(rqstp, p, &resp->dirfh); - return xdr_ressize_check(rqstp, p); + + return 1; } /* ACCESS */ int nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_accessres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) - *p++ = htonl(resp->access); - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->access) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } + + return 1; } /* READLINK */ int nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_readlinkres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->len); - xdr_ressize_check(rqstp, p); - rqstp->rq_res.page_len = resp->len; - if (resp->len & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); - } - if (svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; - return 1; - } else - return xdr_ressize_check(rqstp, p); + if (xdr_stream_encode_u32(xdr, resp->len) < 0) + return 0; + xdr_write_pages(xdr, resp->pages, 0, resp->len); + if 
(svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } + + return 1; } /* READ */ int nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_readres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->count); - *p++ = htonl(resp->eof); - *p++ = htonl(resp->count); /* xdr opaque count */ - xdr_ressize_check(rqstp, p); - /* now update rqstp->rq_res to reflect data as well */ - rqstp->rq_res.page_len = resp->count; - if (resp->count & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); - } - if (svc_encode_result_payload(rqstp, head->iov_len, - resp->count)) + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; - return 1; - } else - return xdr_ressize_check(rqstp, p); + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + if (xdr_stream_encode_bool(xdr, resp->eof) < 0) + return 0; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, + resp->count); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } + + return 1; } /* WRITE */ int nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_writeres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->fh); - if (resp->status == 0) { - *p++ = htonl(resp->count); - *p++ = htonl(resp->committed); - *p++ = resp->verf[0]; - *p++ = resp->verf[1]; + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + if (xdr_stream_encode_u32(xdr, resp->committed) < 0) + return 0; + if (!svcxdr_encode_writeverf3(xdr, resp->verf)) + return 0; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + + return 1; } /* CREATE, MKDIR, SYMLINK, MKNOD */ int nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_diropres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status == 0) { - *p++ = xdr_one; - p = encode_fh(p, &resp->fh); - p = encode_post_op_attr(rqstp, p, &resp->fh); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_fh3(xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh)) + return 0; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh)) + return 0; } - p = encode_wcc_data(rqstp, p, &resp->dirfh); - return xdr_ressize_check(rqstp, p); + + return 1; } /* RENAME */ int nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_renameres *resp = rqstp->rq_resp; - 
*p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->ffh); - p = encode_wcc_data(rqstp, p, &resp->tfh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->ffh) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh); } /* LINK */ int nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_linkres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - p = encode_wcc_data(rqstp, p, &resp->tfh); - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_nfsstat3(xdr, resp->status) && + svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh) && + svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh); } /* READDIR */ int nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_readdirres *resp = rqstp->rq_resp; + struct xdr_buf *dirlist = &resp->dirlist; - *p++ = resp->status; - p = encode_post_op_attr(rqstp, p, &resp->fh); - - if (resp->status == 0) { - /* stupid readdir cookie */ - memcpy(p, resp->verf, 8); p += 2; - xdr_ressize_check(rqstp, p); - if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE) - return 1; /*No room for trailer */ - rqstp->rq_res.page_len = (resp->count) << 2; - - /* add the 'tail' to the end of the 'head' page - page 0. */ - rqstp->rq_res.tail[0].iov_base = p; - *p++ = 0; /* no more entries */ - *p++ = htonl(resp->common.err == nfserr_eof); - rqstp->rq_res.tail[0].iov_len = 2<<2; - return 1; - } else - return xdr_ressize_check(rqstp, p); -} - -static __be32 * -encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, - int namlen, u64 ino) -{ - *p++ = xdr_one; /* mark entry present */ - p = xdr_encode_hyper(p, ino); /* file id */ - p = xdr_encode_array(p, name, namlen);/* name length & name */ - - cd->offset = p; /* remember pointer */ - p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */ + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_cookieverf3(xdr, resp->verf)) + return 0; + xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + /* no more entries */ + if (xdr_stream_encode_item_absent(xdr) < 0) + return 0; + if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) + return 0; + } - return p; + return 1; } static __be32 @@ -957,267 +1122,327 @@ out: return rv; } -static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino) +/** + * nfs3svc_encode_cookie3 - Encode a directory offset cookie + * @resp: readdir result context + * @offset: offset cookie to encode + * + * The buffer space for the offset cookie has already been reserved + * by svcxdr_encode_entry3_common(). + */ +void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset) { - struct svc_fh *fh = &cd->scratch; - __be32 err; - - fh_init(fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, fh, name, namlen, ino); - if (err) { - *p++ = 0; - *p++ = 0; - goto out; - } - p = encode_post_op_attr(cd->rqstp, p, fh); - *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, fh); -out: - fh_put(fh); - return p; -} + __be64 cookie = cpu_to_be64(offset); -/* - * Encode a directory entry. 
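The readdir entry encoders stream a placeholder cookie (NFS_OFFSET_MAX) for each entry and record where it landed in resp->cookie_offset; only when the following entry, or the end of the listing, reveals the real directory offset does nfs3svc_encode_cookie3() overwrite the placeholder. The backpatch itself, isolated:

static void patch_cookie(struct xdr_buf *dirlist, unsigned int cookie_offset,
			 u64 next_offset)
{
	__be64 cookie = cpu_to_be64(next_offset);

	write_bytes_to_xdr_buf(dirlist, cookie_offset, &cookie,
			       sizeof(cookie));
}

Deferring the write removes the old scheme of carrying raw resp->offset/offset1 pointers that could straddle a page boundary.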
This one works for both normal readdir - * and readdirplus. - * The normal readdir reply requires 2 (fileid) + 1 (stringlen) - * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen. - * - * The readdirplus baggage is 1+21 words for post_op_attr, plus the - * file handle. - */ + if (!resp->cookie_offset) + return; + write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie, + sizeof(cookie)); + resp->cookie_offset = 0; +} -#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) -#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) -static int -encode_entry(struct readdir_cd *ccd, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type, int plus) +static bool +svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name, + int namlen, loff_t offset, u64 ino) { - struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, - common); - __be32 *p = cd->buffer; - caddr_t curr_page_addr = NULL; - struct page ** page; - int slen; /* string (name) length */ - int elen; /* estimated entry length in words */ - int num_entry_words = 0; /* actual number of words */ - - if (cd->offset) { - u64 offset64 = offset; - - if (unlikely(cd->offset1)) { - /* we ended up with offset on a page boundary */ - *cd->offset = htonl(offset64 >> 32); - *cd->offset1 = htonl(offset64 & 0xffffffff); - cd->offset1 = NULL; - } else { - xdr_encode_hyper(cd->offset, offset64); - } - cd->offset = NULL; - } + struct xdr_buf *dirlist = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; - /* - dprintk("encode_entry(%.*s @%ld%s)\n", - namlen, name, (long) offset, plus? " plus" : ""); - */ - - /* truncate filename if too long */ - namlen = min(namlen, NFS3_MAXNAMLEN); + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + /* fileid */ + if (xdr_stream_encode_u64(xdr, ino) < 0) + return false; + /* name */ + if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS3_MAXNAMLEN)) < 0) + return false; + /* cookie */ + resp->cookie_offset = dirlist->len; + if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0) + return false; - slen = XDR_QUADLEN(namlen); - elen = slen + NFS3_ENTRY_BAGGAGE - + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0); + return true; +} - if (cd->buflen < elen) { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } +/** + * nfs3svc_encode_entry3 - encode one NFSv3 READDIR entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. 
+ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfs3svc_encode_entry3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd3_readdirres *resp = container_of(ccd, + struct nfsd3_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; - /* determine which page in rq_respages[] we are currently filling */ - for (page = cd->rqstp->rq_respages + 1; - page < cd->rqstp->rq_next_page; page++) { - curr_page_addr = page_address(*page); + /* The offset cookie for the previous entry */ + nfs3svc_encode_cookie3(resp, offset); - if (((caddr_t)cd->buffer >= curr_page_addr) && - ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) - break; - } + if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino)) + goto out_toosmall; - if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) { - /* encode entry in current page */ + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; + return 0; - p = encode_entry_baggage(cd, p, name, namlen, ino); +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; +} - if (plus) - p = encode_entryplus_baggage(cd, p, name, namlen, ino); - num_entry_words = p - cd->buffer; - } else if (*(page+1) != NULL) { - /* temporarily encode entry into next page, then move back to - * current and next page in rq_respages[] */ - __be32 *p1, *tmp; - int len1, len2; +static bool +svcxdr_encode_entry3_plus(struct nfsd3_readdirres *resp, const char *name, + int namlen, u64 ino) +{ + struct xdr_stream *xdr = &resp->xdr; + struct svc_fh *fhp = &resp->scratch; + bool result; - /* grab next page for temporary storage of entry */ - p1 = tmp = page_address(*(page+1)); + result = false; + fh_init(fhp, NFS3_FHSIZE); + if (compose_entry_fh(resp, fhp, name, namlen, ino) != nfs_ok) + goto out_noattrs; - p1 = encode_entry_baggage(cd, p1, name, namlen, ino); + if (!svcxdr_encode_post_op_attr(resp->rqstp, xdr, fhp)) + goto out; + if (!svcxdr_encode_post_op_fh3(xdr, fhp)) + goto out; + result = true; - if (plus) - p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino); +out: + fh_put(fhp); + return result; - /* determine entry word length and lengths to go in pages */ - num_entry_words = p1 - tmp; - len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer; - if ((num_entry_words << 2) < len1) { - /* the actual number of words in the entry is less - * than elen and can still fit in the current page - */ - memmove(p, tmp, num_entry_words << 2); - p += num_entry_words; - - /* update offset */ - cd->offset = cd->buffer + (cd->offset - tmp); - } else { - unsigned int offset_r = (cd->offset - tmp) << 2; - - /* update pointer to offset location. 
- * This is a 64bit quantity, so we need to - * deal with 3 cases: - * - entirely in first page - * - entirely in second page - * - 4 bytes in each page - */ - if (offset_r + 8 <= len1) { - cd->offset = p + (cd->offset - tmp); - } else if (offset_r >= len1) { - cd->offset -= len1 >> 2; - } else { - /* sitting on the fence */ - BUG_ON(offset_r != len1 - 4); - cd->offset = p + (cd->offset - tmp); - cd->offset1 = tmp; - } - - len2 = (num_entry_words << 2) - len1; - - /* move from temp page to current and next pages */ - memmove(p, tmp, len1); - memmove(tmp, (caddr_t)tmp+len1, len2); - - p = tmp + (len2 >> 2); - } - } - else { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } +out_noattrs: + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + if (xdr_stream_encode_item_absent(xdr) < 0) + return false; + return true; +} - cd->buflen -= num_entry_words; - cd->buffer = p; - cd->common.err = nfs_ok; +/** + * nfs3svc_encode_entryplus3 - encode one NFSv3 READDIRPLUS entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. + * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd3_readdirres *resp = container_of(ccd, + struct nfsd3_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; + + /* The offset cookie for the previous entry */ + nfs3svc_encode_cookie3(resp, offset); + + if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino)) + goto out_toosmall; + if (!svcxdr_encode_entry3_plus(resp, name, namlen, ino)) + goto out_toosmall; + + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; return 0; +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; } -int -nfs3svc_encode_entry(void *cd, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int d_type) +static bool +svcxdr_encode_fsstat3resok(struct xdr_stream *xdr, + const struct nfsd3_fsstatres *resp) { - return encode_entry(cd, name, namlen, offset, ino, d_type, 0); -} + const struct kstatfs *s = &resp->stats; + u64 bs = s->f_bsize; + __be32 *p; -int -nfs3svc_encode_entry_plus(void *cd, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - return encode_entry(cd, name, namlen, offset, ino, d_type, 1); + p = xdr_reserve_space(xdr, XDR_UNIT * 13); + if (!p) + return false; + p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ + p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ + p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ + p = xdr_encode_hyper(p, s->f_files); /* total inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ + p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ + *p = cpu_to_be32(resp->invarsec); /* mean unchanged time */ + + return true; } /* FSSTAT */ int nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_fsstatres *resp = rqstp->rq_resp; - struct kstatfs *s = &resp->stats; - 
u64 bs = s->f_bsize; - - *p++ = resp->status; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */ - p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */ - p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */ - p = xdr_encode_hyper(p, s->f_files); /* total inodes */ - p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */ - p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */ - *p++ = htonl(resp->invarsec); /* mean unchanged time */ + + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; + if (!svcxdr_encode_fsstat3resok(xdr, resp)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + + return 1; +} + +static bool +svcxdr_encode_fsinfo3resok(struct xdr_stream *xdr, + const struct nfsd3_fsinfores *resp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 12); + if (!p) + return false; + *p++ = cpu_to_be32(resp->f_rtmax); + *p++ = cpu_to_be32(resp->f_rtpref); + *p++ = cpu_to_be32(resp->f_rtmult); + *p++ = cpu_to_be32(resp->f_wtmax); + *p++ = cpu_to_be32(resp->f_wtpref); + *p++ = cpu_to_be32(resp->f_wtmult); + *p++ = cpu_to_be32(resp->f_dtpref); + p = xdr_encode_hyper(p, resp->f_maxfilesize); + p = encode_nfstime3(p, &nfs3svc_time_delta); + *p = cpu_to_be32(resp->f_properties); + + return true; } /* FSINFO */ int nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_fsinfores *resp = rqstp->rq_resp; - *p++ = resp->status; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - *p++ = htonl(resp->f_rtmax); - *p++ = htonl(resp->f_rtpref); - *p++ = htonl(resp->f_rtmult); - *p++ = htonl(resp->f_wtmax); - *p++ = htonl(resp->f_wtpref); - *p++ = htonl(resp->f_wtmult); - *p++ = htonl(resp->f_dtpref); - p = xdr_encode_hyper(p, resp->f_maxfilesize); - *p++ = xdr_one; - *p++ = xdr_zero; - *p++ = htonl(resp->f_properties); + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; + if (!svcxdr_encode_fsinfo3resok(xdr, resp)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + return 1; +} + +static bool +svcxdr_encode_pathconf3resok(struct xdr_stream *xdr, + const struct nfsd3_pathconfres *resp) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 6); + if (!p) + return false; + *p++ = cpu_to_be32(resp->p_link_max); + *p++ = cpu_to_be32(resp->p_name_max); + p = xdr_encode_bool(p, resp->p_no_trunc); + p = xdr_encode_bool(p, resp->p_chown_restricted); + p = xdr_encode_bool(p, resp->p_case_insensitive); + xdr_encode_bool(p, resp->p_case_preserving); + + return true; } /* PATHCONF */ int nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_pathconfres *resp = rqstp->rq_resp; - *p++ = resp->status; - *p++ = xdr_zero; /* no post_op_attr */ - - if (resp->status == 0) { - *p++ = htonl(resp->p_link_max); - *p++ = htonl(resp->p_name_max); - *p++ = htonl(resp->p_no_trunc); - *p++ = htonl(resp->p_chown_restricted); - *p++ = htonl(resp->p_case_insensitive); - *p++ = htonl(resp->p_case_preserving); + 
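FSSTAT3, FSINFO3 and PATHCONF3 must still emit a post_op_attr even though no file handle survives to the encoder, which is what the const nfs3svc_null_fh defined earlier is for: with fh_no_wcc set, svcxdr_encode_post_op_attr() takes its attributes-absent branch and emits exactly one FALSE word. In isolation:

static const struct svc_fh no_attr_fh = { .fh_no_wcc = true };

static bool encode_absent_attrs(struct svc_rqst *rqstp,
				struct xdr_stream *xdr)
{
	return svcxdr_encode_post_op_attr(rqstp, xdr, &no_attr_fh);
}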
if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; + if (!svcxdr_encode_pathconf3resok(xdr, resp)) + return 0; + break; + default: + if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + return 1; } /* COMMIT */ int nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd3_commitres *resp = rqstp->rq_resp; - *p++ = resp->status; - p = encode_wcc_data(rqstp, p, &resp->fh); - /* Write verifier */ - if (resp->status == 0) { - *p++ = resp->verf[0]; - *p++ = resp->verf[1]; + if (!svcxdr_encode_nfsstat3(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_writeverf3(xdr, resp->verf)) + return 0; + break; + default: + if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh)) + return 0; } - return xdr_ressize_check(rqstp, p); + + return 1; } /* diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index dd9f38d072dd..daf43b980d4b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1383,10 +1383,13 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) { ssize_t bytes_copied = 0; - size_t bytes_total = copy->cp_count; + u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; + /* See RFC 7862 p.67: */ + if (bytes_total == 0) + bytes_total = ULLONG_MAX; do { if (kthread_should_stop()) break; @@ -1538,8 +1541,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!nfs4_init_copy_state(nn, copy)) goto out_err; refcount_set(&async_copy->refcount, 1); - memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid, - sizeof(copy->cp_stateid)); + memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.stid, + sizeof(copy->cp_res.cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); @@ -2262,25 +2265,6 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) return !(nextd->op_flags & OP_HANDLES_WRONGSEC); } -static void svcxdr_init_encode(struct svc_rqst *rqstp, - struct nfsd4_compoundres *resp) -{ - struct xdr_stream *xdr = &resp->xdr; - struct xdr_buf *buf = &rqstp->rq_res; - struct kvec *head = buf->head; - - xdr->buf = buf; - xdr->iov = head; - xdr->p = head->iov_base + head->iov_len; - xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; - /* Tail and page_len should be zero at this point: */ - buf->len = buf->head[0].iov_len; - xdr_reset_scratch_buffer(xdr); - xdr->page_ptr = buf->pages - 1; - buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) - - rqstp->rq_auth_slack; -} - #ifdef CONFIG_NFSD_V4_2_INTER_SSC static void check_if_stalefh_allowed(struct nfsd4_compoundargs *args) @@ -2335,10 +2319,14 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); __be32 status; - svcxdr_init_encode(rqstp, resp); - resp->tagp = resp->xdr.p; + resp->xdr = &rqstp->rq_res_stream; + + /* reserve space for: NFS status code */ + xdr_reserve_space(resp->xdr, XDR_UNIT); + + resp->tagp = resp->xdr->p; /* reserve space for: taglen, tag, and opcnt */ - xdr_reserve_space(&resp->xdr, 8 + args->taglen); + xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen); resp->taglen =
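
/*
 * A side note on the _nfsd_copy_file_range() hunk above, not part of the
 * patch: per the RFC 7862 reference in the patch comment, a COPY count of
 * zero means "copy until source EOF", and widening bytes_total from size_t
 * to u64 keeps large counts from being truncated on 32-bit builds. A
 * standalone model of that normalization (helper name invented):
 */
#include <assert.h>
#include <stdint.h>

static uint64_t copy_count_to_copy(uint64_t requested)
{
    /* Zero is the protocol's "whole file" sentinel. */
    return requested ? requested : UINT64_MAX;
}

int main(void)
{
    assert(copy_count_to_copy(4096) == 4096);
    assert(copy_count_to_copy(0) == UINT64_MAX);
    return 0;
}
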
args->taglen; resp->tag = args->tag; resp->rqstp = rqstp; @@ -2444,7 +2432,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) encode_op: if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; - nfsd4_encode_replay(&resp->xdr, op); + nfsd4_encode_replay(resp->xdr, op); status = op->status = op->replay->rp_status; } else { nfsd4_encode_operation(resp, op); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 891395c6c7d3..6fedc49726bf 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -626,7 +626,7 @@ nfsd4_legacy_tracking_init(struct net *net) status = nfsd4_load_reboot_recovery_data(net); if (status) goto err; - printk("NFSD: Using legacy client tracking operations.\n"); + pr_info("NFSD: Using legacy client tracking operations.\n"); return 0; err: @@ -1028,7 +1028,7 @@ nfsd4_init_cld_pipe(struct net *net) status = __nfsd4_init_cld_pipe(net); if (!status) - printk("NFSD: Using old nfsdcld client tracking operations.\n"); + pr_info("NFSD: Using old nfsdcld client tracking operations.\n"); return status; } @@ -1605,7 +1605,7 @@ nfsd4_cld_tracking_init(struct net *net) nfs4_release_reclaim(nn); goto err_remove; } else - printk("NFSD: Using nfsdcld client tracking operations.\n"); + pr_info("NFSD: Using nfsdcld client tracking operations.\n"); return 0; err_remove: @@ -1864,7 +1864,7 @@ nfsd4_umh_cltrack_init(struct net *net) ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); kfree(grace_start); if (!ret) - printk("NFSD: Using UMH upcall client tracking operations.\n"); + pr_info("NFSD: Using UMH upcall client tracking operations.\n"); return ret; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 97447a64bad0..7698172ac0c7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -43,6 +43,7 @@ #include <linux/sunrpc/addr.h> #include <linux/jhash.h> #include <linux/string_helpers.h> +#include <linux/fsnotify.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -2352,6 +2353,10 @@ static int client_info_show(struct seq_file *m, void *v) memcpy(&clid, &clp->cl_clientid, sizeof(clid)); seq_printf(m, "clientid: 0x%llx\n", clid); seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); + if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + seq_puts(m, "status: confirmed\n"); + else + seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); @@ -2702,6 +2707,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, int ret; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct dentry *dentries[ARRAY_SIZE(client_files)]; clp = alloc_client(name); if (clp == NULL) @@ -2721,9 +2727,11 @@ static struct nfs4_client *create_client(struct xdr_netobj name, memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage)); clp->cl_cb_session = NULL; clp->net = net; - clp->cl_nfsd_dentry = nfsd_client_mkdir(nn, &clp->cl_nfsdfs, - clp->cl_clientid.cl_id - nn->clientid_base, - client_files); + clp->cl_nfsd_dentry = nfsd_client_mkdir( + nn, &clp->cl_nfsdfs, + clp->cl_clientid.cl_id - nn->clientid_base, + client_files, dentries); + clp->cl_nfsd_info_dentry = dentries[0]; if (!clp->cl_nfsd_dentry) { free_client(clp); return NULL; @@ -2798,7 +2806,10 @@ move_to_confirmed(struct nfs4_client *clp) list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, 
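
/*
 * A side note for context, not part of the patch: the create_client() hunk
 * above stashes the dentry of the per-client "info" file, and the
 * move_to_confirmed() hunk just below fires FS_MODIFY on it the first time
 * a client is confirmed. One practical consequence is that an inotify
 * watch on that file wakes up on confirmation. A sketch of such a watcher;
 * the mount point and path under nfsd/clients/ are assumptions:
 */
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
    char buf[4096];
    int fd = inotify_init1(0);

    if (fd < 0) {
        perror("inotify_init1");
        return 1;
    }
    /* hypothetical path to one client's info file */
    if (inotify_add_watch(fd, "/proc/fs/nfsd/clients/1/info",
                          IN_MODIFY) < 0) {
        perror("inotify_add_watch");
        return 1;
    }
    if (read(fd, buf, sizeof(buf)) > 0)
        puts("client state changed; re-read the info file");
    close(fd);
    return 0;
}
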
&nn->conf_name_tree); - set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + if (!test_and_set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags) && + clp->cl_nfsd_dentry && + clp->cl_nfsd_info_dentry) + fsnotify_dentry(clp->cl_nfsd_info_dentry, FS_MODIFY); renew_client_locked(clp); } @@ -2903,7 +2914,7 @@ out_err: static void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) { - struct xdr_buf *buf = resp->xdr.buf; + struct xdr_buf *buf = resp->xdr->buf; struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; @@ -2973,7 +2984,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, struct nfsd4_sequence *seq) { struct nfsd4_slot *slot = resp->cstate.slot; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; __be32 status; @@ -3708,7 +3719,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_sequence *seq = &u->sequence; struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfsd4_session *session; struct nfs4_client *clp; struct nfsd4_slot *slot; @@ -5338,6 +5349,22 @@ static bool clients_still_reclaiming(struct nfsd_net *nn) return true; } +struct laundry_time { + time64_t cutoff; + time64_t new_timeo; +}; + +static bool state_expired(struct laundry_time *lt, time64_t last_refresh) +{ + time64_t time_remaining; + + if (last_refresh < lt->cutoff) + return true; + time_remaining = last_refresh - lt->cutoff; + lt->new_timeo = min(lt->new_timeo, time_remaining); + return false; +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -5347,14 +5374,16 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_ol_stateid *stp; struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; - time64_t cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease; - time64_t t, new_timeo = nn->nfsd4_lease; + struct laundry_time lt = { + .cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease, + .new_timeo = nn->nfsd4_lease + }; struct nfs4_cpntf_state *cps; copy_stateid_t *cps_t; int i; if (clients_still_reclaiming(nn)) { - new_timeo = 0; + lt.new_timeo = 0; goto out; } nfsd4_end_grace(nn); @@ -5364,7 +5393,7 @@ nfs4_laundromat(struct nfsd_net *nn) idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid); if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID && - cps->cpntf_time < cutoff) + state_expired(&lt, cps->cpntf_time)) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); @@ -5372,11 +5401,8 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); - if (clp->cl_time > cutoff) { - t = clp->cl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(&lt, clp->cl_time)) break; - } if (mark_client_expired_locked(clp)) { trace_nfsd_clid_expired(&clp->cl_clientid); continue; @@ -5393,11 +5419,8 @@ nfs4_laundromat(struct nfsd_net *nn) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - if (dp->dl_time > cutoff) { - t = dp->dl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(&lt, dp->dl_time)) break; - } WARN_ON(!unhash_delegation_locked(dp)); list_add(&dp->dl_recall_lru, &reaplist); } @@ -5413,11 +5436,8 @@ nfs4_laundromat(struct nfsd_net *nn) while (!list_empty(&nn->close_lru)) { oo = list_first_entry(&nn->close_lru, struct 
nfs4_openowner, oo_close_lru); - if (oo->oo_time > cutoff) { - t = oo->oo_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(&lt, oo->oo_time)) break; - } list_del_init(&oo->oo_close_lru); stp = oo->oo_last_closed_stid; oo->oo_last_closed_stid = NULL; @@ -5443,11 +5463,8 @@ nfs4_laundromat(struct nfsd_net *nn) while (!list_empty(&nn->blocked_locks_lru)) { nbl = list_first_entry(&nn->blocked_locks_lru, struct nfsd4_blocked_lock, nbl_lru); - if (nbl->nbl_time > cutoff) { - t = nbl->nbl_time - cutoff; - new_timeo = min(new_timeo, t); + if (!state_expired(&lt, nbl->nbl_time)) break; - } list_move(&nbl->nbl_lru, &reaplist); list_del_init(&nbl->nbl_list); } @@ -5460,8 +5477,7 @@ nfs4_laundromat(struct nfsd_net *nn) free_blocked_lock(nbl); } out: - new_timeo = max_t(time64_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); - return new_timeo; + return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } static struct workqueue_struct *laundry_wq; @@ -7321,14 +7337,9 @@ nfs4_state_start_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; - ret = get_nfsdfs(net); - if (ret) - return ret; ret = nfs4_state_create_net(net); - if (ret) { - mntput(nn->nfsd_mnt); + if (ret) return ret; - } locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0) @@ -7398,7 +7409,6 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); - mntput(nn->nfsd_mnt); } void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index eaaa1605b5b5..e0f06d3cbd44 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3581,7 +3581,7 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 8); @@ -3594,7 +3594,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); @@ -3611,7 +3611,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, static __be32 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &close->cl_stateid); } @@ -3620,7 +3620,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c static __be32 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); @@ -3634,7 +3634,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -3649,7 +3649,7 @@ static __be32 nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr) { struct 
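
/*
 * A side note on the laundromat refactor above, not part of the patch: it
 * folds the repeated "expired, or else shrink the next timeout" test into
 * state_expired(). A self-contained model of that helper (plain integers
 * instead of time64_t; the names mirror the patch, the harness is made up):
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct laundry_time {
    int64_t cutoff;      /* items refreshed before this are stale */
    int64_t new_timeo;   /* seconds until the next laundromat run */
};

static bool state_expired(struct laundry_time *lt, int64_t last_refresh)
{
    if (last_refresh < lt->cutoff)
        return true;             /* caller reaps this item */
    /* Still live: schedule the next run no later than its expiry. */
    if (last_refresh - lt->cutoff < lt->new_timeo)
        lt->new_timeo = last_refresh - lt->cutoff;
    return false;
}

int main(void)
{
    struct laundry_time lt = { .cutoff = 100, .new_timeo = 90 };

    assert(state_expired(&lt, 50));    /* stale item */
    assert(!state_expired(&lt, 130));  /* live; expires in 30s */
    assert(lt.new_timeo == 30);
    return 0;
}
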
svc_fh *fhp = getattr->ga_fhp; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, getattr->ga_bmval, resp->rqstp, 0); @@ -3658,7 +3658,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 static __be32 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct svc_fh *fhp = *fhpp; unsigned int len; __be32 *p; @@ -3713,7 +3713,7 @@ again: static __be32 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; if (!nfserr) nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); @@ -3726,7 +3726,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo static __be32 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); @@ -3736,7 +3736,7 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &locku->lu_stateid); } @@ -3745,7 +3745,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l static __be32 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -3759,7 +3759,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); @@ -3853,7 +3853,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); } @@ -3861,7 +3861,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_encode_stateid(xdr, &od->od_stateid); } @@ -3871,7 +3871,7 @@ static __be32 nfsd4_encode_splice_read( struct nfsd4_read *read, struct file *file, unsigned long maxcount) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct xdr_buf *buf = xdr->buf; int status, space_left; u32 eof; @@ -3937,7 +3937,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, struct nfsd4_read *read, struct file *file, unsigned long maxcount) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; u32 eof; int starting_len = xdr->buf->len - 8; __be32 nfserr; @@ -3976,7 +3976,7 @@ 
nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { unsigned long maxcount; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct file *file; int starting_len = xdr->buf->len; __be32 *p; @@ -3990,7 +3990,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); return nfserr_resource; } - if (resp->xdr.buf->page_len && + if (resp->xdr->buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); return nfserr_resource; @@ -4020,7 +4020,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd int maxcount; __be32 wire_count; int zero = 0; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; int length_offset = xdr->buf->len; int status; __be32 *p; @@ -4072,7 +4072,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 int bytes_left; loff_t offset; __be64 wire_offset; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; int starting_len = xdr->buf->len; __be32 *p; @@ -4083,8 +4083,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ *p++ = cpu_to_be32(0); *p++ = cpu_to_be32(0); - resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p) - - (char *)resp->xdr.buf->head[0].iov_base; + xdr->buf->head[0].iov_len = (char *)xdr->p - + (char *)xdr->buf->head[0].iov_base; /* * Number of bytes left for directory entries allowing for the @@ -4159,7 +4159,7 @@ err_no_verf: static __be32 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -4172,7 +4172,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 40); @@ -4255,7 +4255,7 @@ static __be32 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo *secinfo) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); } @@ -4264,7 +4264,7 @@ static __be32 nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_secinfo_no_name *secinfo) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); } @@ -4276,7 +4276,7 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 16); @@ -4300,7 +4300,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 static __be32 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; if (!nfserr) { @@ -4324,7 +4324,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n static __be32 
nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 16); @@ -4341,7 +4341,7 @@ static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; char *major_id; char *server_scope; @@ -4419,7 +4419,7 @@ static __be32 nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create_session *sess) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 24); @@ -4472,7 +4472,7 @@ static __be32 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_sequence *seq) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); @@ -4495,7 +4495,7 @@ static __be32 nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid *test_stateid) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; @@ -4516,7 +4516,7 @@ static __be32 nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getdeviceinfo *gdev) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; __be32 *p; @@ -4572,7 +4572,7 @@ static __be32 nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutget *lgp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; const struct nfsd4_layout_ops *ops; __be32 *p; @@ -4599,7 +4599,7 @@ static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutcommit *lcp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 4); @@ -4620,7 +4620,7 @@ static __be32 nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutreturn *lrp) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 4); @@ -4638,7 +4638,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write, bool sync) { __be32 *p; - p = xdr_reserve_space(&resp->xdr, 4); + p = xdr_reserve_space(resp->xdr, 4); if (!p) return nfserr_resource; @@ -4647,11 +4647,11 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp, else { __be32 nfserr; *p++ = cpu_to_be32(1); - nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid); + nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid); if (nfserr) return nfserr; } - p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); + p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); if (!p) return nfserr_resource; @@ -4665,7 +4665,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp, static __be32 nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfs42_netaddr *addr; __be32 *p; @@ -4713,7 +4713,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, if (nfserr) return nfserr; - p = xdr_reserve_space(&resp->xdr, 4 + 4); + 
p = xdr_reserve_space(resp->xdr, 4 + 4); *p++ = xdr_one; /* cr_consecutive */ *p++ = cpu_to_be32(copy->cp_synchronous); return 0; @@ -4723,7 +4723,7 @@ static __be32 nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_offload_status *os) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 8 + 4); @@ -4740,7 +4740,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, unsigned long *maxcount, u32 *eof, loff_t *pos) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct file *file = read->rd_nf->nf_file; int starting_len = xdr->buf->len; loff_t hole_pos; @@ -4799,7 +4799,7 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, count = data_pos - read->rd_offset; /* Content type, offset, byte count */ - p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8); + p = xdr_reserve_space(resp->xdr, 4 + 8 + 8); if (!p) return nfserr_resource; @@ -4817,7 +4817,7 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_read *read) { unsigned long maxcount, count; - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct file *file; int starting_len = xdr->buf->len; int last_segment = xdr->buf->len; @@ -4888,7 +4888,7 @@ static __be32 nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_copy_notify *cn) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; if (nfserr) @@ -4924,7 +4924,7 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, { __be32 *p; - p = xdr_reserve_space(&resp->xdr, 4 + 8); + p = xdr_reserve_space(resp->xdr, 4 + 8); *p++ = cpu_to_be32(seek->seek_eof); p = xdr_encode_hyper(p, seek->seek_pos); @@ -4985,7 +4985,7 @@ static __be32 nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getxattr *getxattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p, err; p = xdr_reserve_space(xdr, 4); @@ -5009,7 +5009,7 @@ static __be32 nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setxattr *setxattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -5050,7 +5050,7 @@ static __be32 nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_listxattrs *listxattrs) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; u32 cookie_offset, count_offset, eof; u32 left, xdrleft, slen, count; u32 xdrlen, offset; @@ -5161,7 +5161,7 @@ static __be32 nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_removexattr *removexattr) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; __be32 *p; p = xdr_reserve_space(xdr, 20); @@ -5301,7 +5301,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize) void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { - struct xdr_stream *xdr = &resp->xdr; + struct xdr_stream *xdr = resp->xdr; struct nfs4_stateowner *so = resp->cstate.replay_owner; struct svc_rqst *rqstp = resp->rqstp; const struct nfsd4_operation *opdesc = op->opdesc; @@ -5430,14 +5430,14 @@ int nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd4_compoundres *resp = rqstp->rq_resp; - struct xdr_buf *buf = resp->xdr.buf; + struct xdr_buf *buf = resp->xdr->buf; WARN_ON_ONCE(buf->len 
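
/*
 * A side note on the READ_PLUS encoders above, not part of the patch:
 * carving a reply into data and hole segments depends on the filesystem
 * reporting where holes start (the hole_pos local above; the lookup itself
 * sits in elided context, presumably a vfs_llseek() with SEEK_HOLE). A
 * self-contained userspace illustration of the same probe via lseek(2);
 * the default test path is arbitrary:
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/etc/hostname";
    off_t hole;
    int fd = open(path, O_RDONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* First hole at or after offset 0; EOF always counts as a hole. */
    hole = lseek(fd, 0, SEEK_HOLE);
    if (hole < 0)
        perror("SEEK_HOLE");
    else
        printf("first hole at byte %lld\n", (long long)hole);
    close(fd);
    return 0;
}
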
!= buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len); *p = resp->cstate.status; - rqstp->rq_next_page = resp->xdr.page_ptr + 1; + rqstp->rq_next_page = resp->xdr->page_ptr + 1; p = resp->tagp; *p++ = htonl(resp->taglen); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ef86ed23af82..853bf50a2a9b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1266,7 +1266,8 @@ static void nfsdfs_remove_files(struct dentry *root) /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ static int nfsdfs_create_files(struct dentry *root, - const struct tree_descr *files) + const struct tree_descr *files, + struct dentry **fdentries) { struct inode *dir = d_inode(root); struct inode *inode; @@ -1275,8 +1276,6 @@ static int nfsdfs_create_files(struct dentry *root, inode_lock(dir); for (i = 0; files->name && files->name[0]; i++, files++) { - if (!files->name) - continue; dentry = d_alloc_name(root, files->name); if (!dentry) goto out; @@ -1290,6 +1289,8 @@ static int nfsdfs_create_files(struct dentry *root, inode->i_private = __get_nfsdfs_client(dir); d_add(dentry, inode); fsnotify_create(dir, dentry); + if (fdentries) + fdentries[i] = dentry; } inode_unlock(dir); return 0; @@ -1301,8 +1302,9 @@ out: /* on success, returns positive number unique to that client. */ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, - struct nfsdfs_client *ncl, u32 id, - const struct tree_descr *files) + struct nfsdfs_client *ncl, u32 id, + const struct tree_descr *files, + struct dentry **fdentries) { struct dentry *dentry; char name[11]; @@ -1313,7 +1315,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name); if (IS_ERR(dentry)) /* XXX: tossing errors? */ return NULL; - ret = nfsdfs_create_files(dentry, files); + ret = nfsdfs_create_files(dentry, files, fdentries); if (ret) { nfsd_client_rmdir(dentry); return NULL; @@ -1416,6 +1418,8 @@ static void nfsd_umount(struct super_block *sb) { struct net *net = sb->s_fs_info; + nfsd_shutdown_threads(net); + kill_litter_super(sb); put_net(net); } @@ -1428,18 +1432,6 @@ static struct file_system_type nfsd_fs_type = { }; MODULE_ALIAS_FS("nfsd"); -int get_nfsdfs(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct vfsmount *mnt; - - mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - nn->nfsd_mnt = mnt; - return 0; -} - #ifdef CONFIG_PROC_FS static int create_proc_exports_entry(void) { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 8bdc37aa2c2e..14dbfa75059d 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -93,13 +93,12 @@ int nfsd_get_nrthreads(int n, int *, struct net *); int nfsd_set_nrthreads(int n, int *, struct net *); int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); +void nfsd_shutdown_threads(struct net *net); void nfsd_destroy(struct net *net); bool i_am_nfsd(void); -int get_nfsdfs(struct net *); - struct nfsdfs_client { struct kref cl_ref; void (*cl_release)(struct kref *kref); @@ -107,7 +106,9 @@ struct nfsdfs_client { struct nfsdfs_client *get_nfsdfs_client(struct inode *); struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, - struct nfsdfs_client *ncl, u32 id, const struct tree_descr *); + struct nfsdfs_client *ncl, u32 id, + const struct tree_descr *, + struct dentry **fdentries); void nfsd_client_rmdir(struct dentry *dentry); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 
10b44421eace..c475d2271f9c 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -711,7 +711,7 @@ char * SVCFH_fmt(struct svc_fh *fhp) return buf; } -enum fsid_source fsid_source(struct svc_fh *fhp) +enum fsid_source fsid_source(const struct svc_fh *fhp) { if (fhp->fh_handle.fh_version != 1) return FSIDSOURCE_DEV; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index f58933519f38..aff2cda5c6c3 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -82,7 +82,7 @@ enum fsid_source { FSIDSOURCE_FSID, FSIDSOURCE_UUID, }; -extern enum fsid_source fsid_source(struct svc_fh *fhp); +extern enum fsid_source fsid_source(const struct svc_fh *fhp); /* diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a8d5449dd0e9..60d7c59e7935 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -151,13 +151,14 @@ nfsd_proc_readlink(struct svc_rqst *rqstp) { struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_readlinkres *resp = rqstp->rq_resp; - char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len); + resp->page = *(rqstp->rq_next_page++); + resp->status = nfsd_readlink(rqstp, &argp->fh, + page_address(resp->page), &resp->len); fh_put(&argp->fh); return rpc_success; @@ -184,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) v = 0; len = argp->count; + resp->pages = rqstp->rq_next_page; while (len > 0) { struct page *page = *(rqstp->rq_next_page++); @@ -381,7 +383,7 @@ nfsd_proc_create(struct svc_rqst *rqstp) /* Make sure the type and device matches */ resp->status = nfserr_exist; - if (inode && type != (inode->i_mode & S_IFMT)) + if (inode && inode_wrong_type(inode, type)) goto out_unlock; } @@ -557,14 +559,27 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, struct nfsd_readdirres *resp, int count) { + struct xdr_buf *buf = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; + count = min_t(u32, count, PAGE_SIZE); - /* Convert byte count to number of words (i.e. >> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->buflen = (count >> 2) - 2; + memset(buf, 0, sizeof(*buf)); - resp->buffer = page_address(*rqstp->rq_next_page); + /* Reserve room for the NULL ptr & eof flag (-2 words) */ + buf->buflen = count - sizeof(__be32) * 2; + buf->pages = rqstp->rq_next_page; rqstp->rq_next_page++; + + /* This is xdr_init_encode(), but it assumes that + * the head kvec has already been consumed. 
*/ + xdr_set_scratch_buffer(xdr, NULL, 0); + xdr->buf = buf; + xdr->page_ptr = buf->pages; + xdr->iov = NULL; + xdr->p = page_address(*buf->pages); + xdr->end = xdr->p + (PAGE_SIZE >> 2); + xdr->rqst = NULL; } /* @@ -576,25 +591,19 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) struct nfsd_readdirargs *argp = rqstp->rq_argp; struct nfsd_readdirres *resp = rqstp->rq_resp; loff_t offset; - __be32 *buffer; dprintk("nfsd: READDIR %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->cookie); nfsd_init_dirlist_pages(rqstp, resp, argp->count); - buffer = resp->buffer; - resp->offset = NULL; resp->common.err = nfs_ok; - /* Read directory and encode entries on the fly */ + resp->cookie_offset = 0; offset = argp->cookie; resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, &resp->common, nfssvc_encode_entry); - - resp->count = resp->buffer - buffer; - if (resp->offset) - *resp->offset = htonl(offset); + nfssvc_encode_nfscookie(resp, offset); fh_put(&argp->fh); return rpc_success; @@ -640,7 +649,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, .pc_decode = nfssvc_decode_fhandleargs, - .pc_encode = nfssvc_encode_attrstat, + .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), @@ -651,7 +660,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_SETATTR] = { .pc_func = nfsd_proc_setattr, .pc_decode = nfssvc_decode_sattrargs, - .pc_encode = nfssvc_encode_attrstat, + .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_sattrargs), .pc_ressize = sizeof(struct nfsd_attrstat), @@ -714,7 +723,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_WRITE] = { .pc_func = nfsd_proc_write, .pc_decode = nfssvc_decode_writeargs, - .pc_encode = nfssvc_encode_attrstat, + .pc_encode = nfssvc_encode_attrstatres, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_writeargs), .pc_ressize = sizeof(struct nfsd_attrstat), @@ -736,7 +745,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_REMOVE] = { .pc_func = nfsd_proc_remove, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -746,7 +755,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_RENAME] = { .pc_func = nfsd_proc_rename, .pc_decode = nfssvc_decode_renameargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_renameargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -756,7 +765,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_LINK] = { .pc_func = nfsd_proc_link, .pc_decode = nfssvc_decode_linkargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_linkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -766,7 +775,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_SYMLINK] = { .pc_func = nfsd_proc_symlink, .pc_decode = nfssvc_decode_symlinkargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_symlinkargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, @@ -787,7 
+796,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_RMDIR] = { .pc_func = nfsd_proc_rmdir, .pc_decode = nfssvc_decode_diropargs, - .pc_encode = nfssvc_encode_stat, + .pc_encode = nfssvc_encode_statres, .pc_argsize = sizeof(struct nfsd_diropargs), .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 6de406322106..82ba034fa579 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -84,7 +84,7 @@ DEFINE_MUTEX(nfsd_mutex); * version 4.1 DRC caches. * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. */ -spinlock_t nfsd_drc_lock; +DEFINE_SPINLOCK(nfsd_drc_lock); unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; @@ -563,7 +563,6 @@ static void set_max_drc(void) nfsd_drc_max_mem = (nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; - spin_lock_init(&nfsd_drc_lock); dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); } @@ -596,6 +595,37 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_module = THIS_MODULE, }; +static void nfsd_complete_shutdown(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + WARN_ON(!mutex_is_locked(&nfsd_mutex)); + + nn->nfsd_serv = NULL; + complete(&nn->nfsd_shutdown_complete); +} + +void nfsd_shutdown_threads(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; + + mutex_lock(&nfsd_mutex); + serv = nn->nfsd_serv; + if (serv == NULL) { + mutex_unlock(&nfsd_mutex); + return; + } + + svc_get(serv); + /* Kill outstanding nfsd threads */ + serv->sv_ops->svo_setup(serv, NULL, 0); + nfsd_destroy(net); + mutex_unlock(&nfsd_mutex); + /* Wait for shutdown of nfsd_serv to complete */ + wait_for_completion(&nn->nfsd_shutdown_complete); +} + bool i_am_nfsd(void) { return kthread_func(current) == nfsd; @@ -618,11 +648,13 @@ int nfsd_create_serv(struct net *net) &nfsd_thread_sv_ops); if (nn->nfsd_serv == NULL) return -ENOMEM; + init_completion(&nn->nfsd_shutdown_complete); nn->nfsd_serv->sv_maxconn = nn->max_connections; error = svc_bind(nn->nfsd_serv, net); if (error < 0) { svc_destroy(nn->nfsd_serv); + nfsd_complete_shutdown(net); return error; } @@ -671,7 +703,7 @@ void nfsd_destroy(struct net *net) svc_shutdown_net(nn->nfsd_serv, net); svc_destroy(nn->nfsd_serv); if (destroy) - nn->nfsd_serv = NULL; + nfsd_complete_shutdown(net); } int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) @@ -997,7 +1029,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) * NFSv4 does some encoding while processing */ p = resv->iov_base + resv->iov_len; - resv->iov_len += sizeof(__be32); + svcxdr_init_encode(rqstp); *statp = proc->pc_func(rqstp); if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) @@ -1052,7 +1084,7 @@ int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) */ int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) { - return xdr_ressize_check(rqstp, p); + return 1; } int nfsd_pool_stats_open(struct inode *inode, struct file *file) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 5d79ef6a0c7f..a06c05fe3b42 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -9,12 +9,10 @@ #include "xdr.h" #include "auth.h" -#define NFSDDBG_FACILITY NFSDDBG_XDR - /* * Mapping of S_IF* types to NFS file types */ -static u32 nfs_ftypes[] = { +static const u32 nfs_ftypes[] = { NFNON, NFCHR, NFCHR, NFBAD, NFDIR, NFBAD, NFBLK, NFBAD, NFREG, NFBAD, NFLNK, NFBAD, @@ -27,6 +25,28 @@ static u32 
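
/*
 * A side note on nfsd_shutdown_threads() above, not part of the patch: the
 * umount path (nfsd_umount() now calls it) must not return while nfsd
 * threads still reference the superblock, so the patch pairs "set thread
 * count to zero" with a completion that fires in nfsd_complete_shutdown().
 * A userspace model of that handshake, with a POSIX semaphore standing in
 * for struct completion (harness names invented):
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t shutdown_complete;

static void *service_thread(void *arg)
{
    (void)arg;
    /* ... serve requests until asked to stop ... */
    sem_post(&shutdown_complete);  /* complete(&nn->nfsd_shutdown_complete) */
    return NULL;
}

int main(void)
{
    pthread_t tid;

    sem_init(&shutdown_complete, 0, 0);
    pthread_create(&tid, NULL, service_thread, NULL);
    /* wait_for_completion(): teardown may proceed only after this */
    sem_wait(&shutdown_complete);
    pthread_join(tid, NULL);
    puts("all service threads drained");
    return 0;
}
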
nfs_ftypes[] = { */ /** + * svcxdr_encode_stat - Encode an NFSv2 status code + * @xdr: XDR stream + * @status: status value to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(status)); + if (!p) + return false; + *p = status; + + return true; +} + +/** * svcxdr_decode_fhandle - Decode an NFSv2 file handle * @xdr: XDR stream positioned at an encoded NFSv2 FH * @fhp: OUT: filled-in server file handle @@ -50,11 +70,28 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) return true; } -static __be32 * -encode_fh(__be32 *p, struct svc_fh *fhp) +static bool +svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp) { + __be32 *p; + + p = xdr_reserve_space(xdr, NFS_FHSIZE); + if (!p) + return false; memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE); - return p + (NFS_FHSIZE>> 2); + + return true; +} + +static __be32 * +encode_timeval(__be32 *p, const struct timespec64 *time) +{ + *p++ = cpu_to_be32((u32)time->tv_sec); + if (time->tv_nsec) + *p++ = cpu_to_be32(time->tv_nsec / NSEC_PER_USEC); + else + *p++ = xdr_zero; + return p; } static bool @@ -162,68 +199,73 @@ svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, return true; } -static __be32 * -encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, - struct kstat *stat) +/** + * svcxdr_encode_fattr - Encode NFSv2 file attributes + * @rqstp: Context of a completed RPC transaction + * @xdr: XDR stream + * @fhp: File handle to encode + * @stat: Attributes to encode + * + * Return values: + * %false: Send buffer space was exhausted + * %true: Success + */ +bool +svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat) { struct user_namespace *userns = nfsd_user_namespace(rqstp); - struct dentry *dentry = fhp->fh_dentry; - int type; + struct dentry *dentry = fhp->fh_dentry; + int type = stat->mode & S_IFMT; struct timespec64 time; - u32 f; + __be32 *p; + u32 fsid; - type = (stat->mode & S_IFMT); + p = xdr_reserve_space(xdr, XDR_UNIT * 17); + if (!p) + return false; - *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat->mode); - *p++ = htonl((u32) stat->nlink); - *p++ = htonl((u32) from_kuid_munged(userns, stat->uid)); - *p++ = htonl((u32) from_kgid_munged(userns, stat->gid)); + *p++ = cpu_to_be32(nfs_ftypes[type >> 12]); + *p++ = cpu_to_be32((u32)stat->mode); + *p++ = cpu_to_be32((u32)stat->nlink); + *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid)); + *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid)); - if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { - *p++ = htonl(NFS_MAXPATHLEN); - } else { - *p++ = htonl((u32) stat->size); - } - *p++ = htonl((u32) stat->blksize); + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) + *p++ = cpu_to_be32(NFS_MAXPATHLEN); + else + *p++ = cpu_to_be32((u32) stat->size); + *p++ = cpu_to_be32((u32) stat->blksize); if (S_ISCHR(type) || S_ISBLK(type)) - *p++ = htonl(new_encode_dev(stat->rdev)); + *p++ = cpu_to_be32(new_encode_dev(stat->rdev)); else - *p++ = htonl(0xffffffff); - *p++ = htonl((u32) stat->blocks); + *p++ = cpu_to_be32(0xffffffff); + *p++ = cpu_to_be32((u32)stat->blocks); + switch (fsid_source(fhp)) { - default: - case FSIDSOURCE_DEV: - *p++ = htonl(new_encode_dev(stat->dev)); - break; case FSIDSOURCE_FSID: - *p++ = htonl((u32) fhp->fh_export->ex_fsid); + fsid = 
(u32)fhp->fh_export->ex_fsid; break; case FSIDSOURCE_UUID: - f = ((u32*)fhp->fh_export->ex_uuid)[0]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[1]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[2]; - f ^= ((u32*)fhp->fh_export->ex_uuid)[3]; - *p++ = htonl(f); + fsid = ((u32 *)fhp->fh_export->ex_uuid)[0]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[1]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[2]; + fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[3]; + break; + default: + fsid = new_encode_dev(stat->dev); break; } - *p++ = htonl((u32) stat->ino); - *p++ = htonl((u32) stat->atime.tv_sec); - *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); - time = stat->mtime; - lease_get_mtime(d_inode(dentry), &time); - *p++ = htonl((u32) time.tv_sec); - *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); - *p++ = htonl((u32) stat->ctime.tv_sec); - *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0); + *p++ = cpu_to_be32(fsid); - return p; -} + *p++ = cpu_to_be32((u32)stat->ino); + p = encode_timeval(p, &stat->atime); + time = stat->mtime; + lease_get_mtime(d_inode(dentry), &time); + p = encode_timeval(p, &time); + encode_timeval(p, &stat->ctime); -/* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat) -{ - return encode_fattr(rqstp, p, fhp, stat); + return true; } /* @@ -390,106 +432,118 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) */ int -nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p) +nfssvc_encode_statres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_stat *resp = rqstp->rq_resp; - *p++ = resp->status; - return xdr_ressize_check(rqstp, p); + return svcxdr_encode_stat(xdr, resp->status); } int -nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) +nfssvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_attrstat *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); -out: - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + break; + } + + return 1; } int nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_diropres *resp = rqstp->rq_resp; - *p++ = resp->status; - if (resp->status != nfs_ok) - goto out; - p = encode_fh(p, &resp->fh); - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); -out: - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fhandle(xdr, &resp->fh)) + return 0; + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + break; + } + + return 1; } int nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_readlinkres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); - - *p++ = htonl(resp->len); - xdr_ressize_check(rqstp, p); - rqstp->rq_res.page_len = resp->len; - if (resp->len & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); - } - if 
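
/*
 * A side note on the svcxdr_encode_fattr() hunk above, not part of the
 * patch: NFSv2 has a single 32-bit fsid field, so a 16-byte export UUID is
 * folded down by XORing its four 32-bit words. A standalone illustration
 * (the UUID bytes are made up):
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t fold_uuid_to_fsid(const unsigned char uuid[16])
{
    uint32_t w[4];

    memcpy(w, uuid, sizeof(w));  /* avoid aliasing/alignment issues */
    return w[0] ^ w[1] ^ w[2] ^ w[3];
}

int main(void)
{
    const unsigned char uuid[16] = {
        0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33,
        0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb,
    };

    printf("fsid = 0x%08" PRIx32 "\n", fold_uuid_to_fsid(uuid));
    return 0;
}
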
(svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + if (!svcxdr_encode_stat(xdr, resp->status)) return 0; + switch (resp->status) { + case nfs_ok: + if (xdr_stream_encode_u32(xdr, resp->len) < 0) + return 0; + xdr_write_pages(xdr, &resp->page, 0, resp->len); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) + return 0; + break; + } + return 1; } int nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_readres *resp = rqstp->rq_resp; struct kvec *head = rqstp->rq_res.head; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); - - p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); - *p++ = htonl(resp->count); - xdr_ressize_check(rqstp, p); - - /* now update rqstp->rq_res to reflect data as well */ - rqstp->rq_res.page_len = resp->count; - if (resp->count & 3) { - /* need to pad the tail */ - rqstp->rq_res.tail[0].iov_base = p; - *p = 0; - rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); - } - if (svc_encode_result_payload(rqstp, head->iov_len, resp->count)) + if (!svcxdr_encode_stat(xdr, resp->status)) return 0; + switch (resp->status) { + case nfs_ok: + if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) + return 0; + if (xdr_stream_encode_u32(xdr, resp->count) < 0) + return 0; + xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, + resp->count); + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) + return 0; + break; + } + return 1; } int nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_readdirres *resp = rqstp->rq_resp; + struct xdr_buf *dirlist = &resp->dirlist; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); - - xdr_ressize_check(rqstp, p); - p = resp->buffer; - *p++ = 0; /* no more entries */ - *p++ = htonl((resp->common.err == nfserr_eof)); - rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1; + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + /* no more entries */ + if (xdr_stream_encode_item_absent(xdr) < 0) + return 0; + if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0) + return 0; + break; + } return 1; } @@ -497,64 +551,113 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p) int nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_res_stream; struct nfsd_statfsres *resp = rqstp->rq_resp; struct kstatfs *stat = &resp->stats; - *p++ = resp->status; - if (resp->status != nfs_ok) - return xdr_ressize_check(rqstp, p); + if (!svcxdr_encode_stat(xdr, resp->status)) + return 0; + switch (resp->status) { + case nfs_ok: + p = xdr_reserve_space(xdr, XDR_UNIT * 5); + if (!p) + return 0; + *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2); + *p++ = cpu_to_be32(stat->f_bsize); + *p++ = cpu_to_be32(stat->f_blocks); + *p++ = cpu_to_be32(stat->f_bfree); + *p = cpu_to_be32(stat->f_bavail); + break; + } - *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ - *p++ = htonl(stat->f_bsize); - *p++ = htonl(stat->f_blocks); - *p++ = htonl(stat->f_bfree); - *p++ = htonl(stat->f_bavail); - return xdr_ressize_check(rqstp, p); + return 1; } -int -nfssvc_encode_entry(void *ccdv, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int d_type) +/** + * nfssvc_encode_nfscookie - Encode a directory offset cookie + * 
@resp: readdir result context + * @offset: offset cookie to encode + * + * The buffer space for the offset cookie has already been reserved + * by svcxdr_encode_entry_common(). + */ +void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset) { - struct readdir_cd *ccd = ccdv; - struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common); - __be32 *p = cd->buffer; - int buflen, slen; + __be32 cookie = cpu_to_be32(offset); - /* - dprintk("nfsd: entry(%.*s off %ld ino %ld)\n", - namlen, name, offset, ino); - */ + if (!resp->cookie_offset) + return; - if (offset > ~((u32) 0)) { - cd->common.err = nfserr_fbig; - return -EINVAL; - } - if (cd->offset) - *cd->offset = htonl(offset); + write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie, + sizeof(cookie)); + resp->cookie_offset = 0; +} - /* truncate filename */ - namlen = min(namlen, NFS2_MAXNAMLEN); - slen = XDR_QUADLEN(namlen); +static bool +svcxdr_encode_entry_common(struct nfsd_readdirres *resp, const char *name, + int namlen, loff_t offset, u64 ino) +{ + struct xdr_buf *dirlist = &resp->dirlist; + struct xdr_stream *xdr = &resp->xdr; - if ((buflen = cd->buflen - slen - 4) < 0) { - cd->common.err = nfserr_toosmall; - return -EINVAL; - } - if (ino > ~((u32) 0)) { - cd->common.err = nfserr_fbig; - return -EINVAL; - } - *p++ = xdr_one; /* mark entry present */ - *p++ = htonl((u32) ino); /* file id */ - p = xdr_encode_array(p, name, namlen);/* name length & name */ - cd->offset = p; /* remember pointer */ - *p++ = htonl(~0U); /* offset of next entry */ - - cd->buflen = buflen; - cd->buffer = p; - cd->common.err = nfs_ok; + if (xdr_stream_encode_item_present(xdr) < 0) + return false; + /* fileid */ + if (xdr_stream_encode_u32(xdr, (u32)ino) < 0) + return false; + /* name */ + if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS2_MAXNAMLEN)) < 0) + return false; + /* cookie */ + resp->cookie_offset = dirlist->len; + if (xdr_stream_encode_u32(xdr, ~0U) < 0) + return false; + + return true; +} + +/** + * nfssvc_encode_entry - encode one NFSv2 READDIR entry + * @data: directory context + * @name: name of the object to be encoded + * @namlen: length of that name, in bytes + * @offset: the offset of the previous entry + * @ino: the fileid of this entry + * @d_type: unused + * + * Return values: + * %0: Entry was successfully encoded. + * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err + * + * On exit, the following fields are updated: + * - resp->xdr + * - resp->common.err + * - resp->cookie_offset + */ +int nfssvc_encode_entry(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct readdir_cd *ccd = data; + struct nfsd_readdirres *resp = container_of(ccd, + struct nfsd_readdirres, + common); + unsigned int starting_length = resp->dirlist.len; + + /* The offset cookie for the previous entry */ + nfssvc_encode_nfscookie(resp, offset); + + if (!svcxdr_encode_entry_common(resp, name, namlen, offset, ino)) + goto out_toosmall; + + xdr_commit_encode(&resp->xdr); + resp->common.err = nfs_ok; return 0; + +out_toosmall: + resp->cookie_offset = 0; + resp->common.err = nfserr_toosmall; + resp->dirlist.len = starting_length; + return -EINVAL; } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 73deea353169..54cab651ac1d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -371,6 +371,10 @@ struct nfs4_client { /* debugging info directory under nfsd/clients/ : */ struct dentry *cl_nfsd_dentry; + /* 'info' file within that directory. 
Ref is not counted, + * but will remain valid iff cl_nfsd_dentry != NULL + */ + struct dentry *cl_nfsd_info_dentry; /* for nfs41 callbacks */ /* We currently support a single back channel with a single slot */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 92a0973dd671..27a93ebd1d80 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -391,6 +391,30 @@ DEFINE_EVENT(nfsd_err_class, nfsd_##name, \ DEFINE_NFSD_ERR_EVENT(read_err); DEFINE_NFSD_ERR_EVENT(write_err); +TRACE_EVENT(nfsd_dirent, + TP_PROTO(struct svc_fh *fhp, + u64 ino, + const char *name, + int namlen), + TP_ARGS(fhp, ino, name, namlen), + TP_STRUCT__entry( + __field(u32, fh_hash) + __field(u64, ino) + __field(int, len) + __dynamic_array(unsigned char, name, namlen) + ), + TP_fast_assign( + __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; + __entry->ino = ino; + __entry->len = namlen; + memcpy(__get_str(name), name, namlen); + __assign_str(name, name); + ), + TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", + __entry->fh_hash, __entry->ino, + __entry->len, __get_str(name)) +) + #include "state.h" #include "filecache.h" #include "vfs.h" diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index fd6be35a1642..15adf1f6ab21 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1968,8 +1968,9 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, return 0; } -static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, - struct readdir_cd *cdp, loff_t *offsetp) +static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp, + nfsd_filldir_t func, struct readdir_cd *cdp, + loff_t *offsetp) { struct buffered_dirent *de; int host_err; @@ -2015,6 +2016,8 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, if (cdp->err != nfs_ok) break; + trace_nfsd_dirent(fhp, de->ino, de->name, de->namlen); + reclen = ALIGN(sizeof(*de) + de->namlen, sizeof(u64)); size -= reclen; @@ -2062,7 +2065,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, goto out_close; } - err = nfsd_buffered_readdir(file, func, cdp, offsetp); + err = nfsd_buffered_readdir(file, fhp, func, cdp, offsetp); if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a2442ebe5acf..b21b76e6b9a8 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -152,7 +152,7 @@ static inline void fh_drop_write(struct svc_fh *fh) } } -static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat) +static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 3018b52b6d5e..f45b4bc93f52 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -94,6 +94,7 @@ struct nfsd_diropres { struct nfsd_readlinkres { __be32 status; int len; + struct page *page; }; struct nfsd_readres { @@ -101,17 +102,20 @@ struct nfsd_readres { struct svc_fh fh; unsigned long count; struct kstat stat; + struct page **pages; }; struct nfsd_readdirres { + /* Components of the reply */ __be32 status; int count; + /* Used to encode the reply's entry list */ + struct xdr_stream xdr; + struct xdr_buf dirlist; struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; + unsigned int cookie_offset; }; struct nfsd_statfsres { @@ -147,23 +151,26 @@ int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *); int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int 
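
/*
 * A side note on the nfssvc_encode_nfscookie() hunk above, not part of the
 * patch: each READDIR entry's "offset of the next entry" is unknown while
 * the entry is being encoded, so the code emits a ~0 placeholder, remembers
 * its position in resp->cookie_offset, and overwrites it through
 * write_bytes_to_xdr_buf() once the next entry (or EOF) supplies the real
 * cookie. A buffer-level model of that reserve-then-backpatch idiom (all
 * names invented; bounds checking omitted for brevity):
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirlist {
    unsigned char buf[256];
    size_t len;            /* bytes encoded so far */
    size_t cookie_offset;  /* 0 == no placeholder outstanding */
};

static void emit_u32(struct dirlist *d, uint32_t v)
{
    uint32_t be = htonl(v);

    memcpy(d->buf + d->len, &be, 4);
    d->len += 4;
}

/* Patch the previously reserved cookie slot, once its value is known. */
static void backpatch_cookie(struct dirlist *d, uint32_t offset)
{
    uint32_t be = htonl(offset);

    if (!d->cookie_offset)
        return;
    memcpy(d->buf + d->cookie_offset, &be, 4);
    d->cookie_offset = 0;
}

static void emit_entry(struct dirlist *d, uint32_t fileid)
{
    emit_u32(d, 1);           /* entry present */
    emit_u32(d, fileid);
    d->cookie_offset = d->len;
    emit_u32(d, ~0U);         /* cookie placeholder */
}

int main(void)
{
    struct dirlist d = { .len = 0, .cookie_offset = 0 };

    emit_entry(&d, 42);
    backpatch_cookie(&d, 7);  /* next readdir offset becomes known */
    printf("encoded %zu bytes\n", d.len);
    return 0;
}
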
nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); -int nfssvc_encode_stat(struct svc_rqst *, __be32 *); -int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *); +int nfssvc_encode_statres(struct svc_rqst *, __be32 *); +int nfssvc_encode_attrstatres(struct svc_rqst *, __be32 *); int nfssvc_encode_diropres(struct svc_rqst *, __be32 *); int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *); int nfssvc_encode_readres(struct svc_rqst *, __be32 *); int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *); int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *); -int nfssvc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, unsigned int); +void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset); +int nfssvc_encode_entry(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); void nfssvc_release_attrstat(struct svc_rqst *rqstp); void nfssvc_release_diropres(struct svc_rqst *rqstp); void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ -__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp); +bool svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status); +bool svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp, const struct kstat *stat); #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 3e1578953f54..933008382bbe 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -137,6 +137,7 @@ struct nfsd3_readlinkres { __be32 status; struct svc_fh fh; __u32 len; + struct page **pages; }; struct nfsd3_readres { @@ -144,6 +145,7 @@ struct nfsd3_readres { struct svc_fh fh; unsigned long count; __u32 eof; + struct page **pages; }; struct nfsd3_writeres { @@ -167,19 +169,17 @@ struct nfsd3_linkres { }; struct nfsd3_readdirres { + /* Components of the reply */ __be32 status; struct svc_fh fh; - /* Just to save kmalloc on every readdirplus entry (svc_fh is a - * little large for the stack): */ - struct svc_fh scratch; - int count; __be32 verf[2]; + /* Used to encode the reply's entry list */ + struct xdr_stream xdr; + struct xdr_buf dirlist; + struct svc_fh scratch; struct readdir_cd common; - __be32 * buffer; - int buflen; - __be32 * offset; - __be32 * offset1; + unsigned int cookie_offset; struct svc_rqst * rqstp; }; @@ -280,9 +280,9 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *); -int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *); +int nfs3svc_encode_getattrres(struct svc_rqst *, __be32 *); int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *); -int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *); +int nfs3svc_encode_lookupres(struct svc_rqst *, __be32 *); int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *); int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *); int nfs3svc_encode_readres(struct svc_rqst *, __be32 *); @@ -298,15 +298,16 @@ int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *); void nfs3svc_release_fhandle(struct svc_rqst *); void nfs3svc_release_fhandle2(struct svc_rqst *); -int nfs3svc_encode_entry(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); -int 
nfs3svc_encode_entry_plus(void *, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int); + +void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset); +int nfs3svc_encode_entry3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); +int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type); /* Helper functions for NFSv3 ACL code */ -__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, - struct svc_fh *fhp); bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp); +bool svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status); +bool svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + const struct svc_fh *fhp); #endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c300885ae75d..fe540a3415c6 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -698,7 +698,7 @@ struct nfsd4_compoundargs { struct nfsd4_compoundres { /* scratch variables for XDR encode */ - struct xdr_stream xdr; + struct xdr_stream *xdr; struct svc_rqst * rqstp; u32 taglen; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3bfb4147895a..ad20403b383f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2295,7 +2295,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; loff_t end = offset + bytes; - int ret = 0, credits = 0, locked = 0; + int ret = 0, credits = 0; ocfs2_init_dealloc_ctxt(&dealloc); @@ -2306,13 +2306,6 @@ static int ocfs2_dio_end_io_write(struct inode *inode, !dwc->dw_orphaned) goto out; - /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we - * are in that context. */ - if (dwc->dw_writer_pid != task_pid_nr(current)) { - inode_lock(inode); - locked = 1; - } - ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); @@ -2393,8 +2386,6 @@ out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); ocfs2_run_deallocs(osb, &dealloc); - if (locked) - inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); return ret; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8e3a369086db..0fbe8bf7190f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2204,7 +2204,7 @@ static void ocfs2_unpack_timespec(struct timespec64 *spec, spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; } -static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +static int ocfs2_refresh_inode_from_lvb(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; @@ -2213,6 +2213,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) mlog_meta_lvb(0, lockres); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (inode_wrong_type(inode, be16_to_cpu(lvb->lvb_imode))) + return -ESTALE; /* We're safe here without the lockres lock... 
*/ spin_lock(&oi->ip_lock); @@ -2240,6 +2242,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) ocfs2_unpack_timespec(&inode->i_ctime, be64_to_cpu(lvb->lvb_ictime_packed)); spin_unlock(&oi->ip_lock); + return 0; } static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, @@ -2342,7 +2345,8 @@ static int ocfs2_inode_lock_update(struct inode *inode, if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); - ocfs2_refresh_inode_from_lvb(inode); + status = ocfs2_refresh_inode_from_lvb(inode); + goto bail_refresh; } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ @@ -2352,6 +2356,10 @@ static int ocfs2_inode_lock_update(struct inode *inode, goto bail_refresh; } fe = (struct ocfs2_dinode *) (*bh)->b_data; + if (inode_wrong_type(inode, le16_to_cpu(fe->i_mode))) { + status = -ESTALE; + goto bail_refresh; + } /* This is a good chance to make sure we're not * locking an invalid object. ocfs2_read_inode_block() diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 908d22b431fa..db8a6265b749 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1245,22 +1245,24 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto bail_unlock; } } + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + 2 * ocfs2_quota_trans_credits(sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } status = __dquot_transfer(inode, transfer_to); if (status < 0) goto bail_commit; } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail_unlock_alloc; } } @@ -1273,6 +1275,8 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, bail_commit: ocfs2_commit_trans(osb, handle); +bail_unlock_alloc: + up_write(&OCFS2_I(inode)->ip_alloc_sem); bail_unlock: if (status && inode_locked) { ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 40c8c2e32fa3..f825176ff4ed 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -236,27 +236,31 @@ found: mutex_unlock(&op_mutex); if (IS_ERR(inode)) return ERR_CAST(inode); - ent_oi = OP_I(inode); - ent_oi->type = ent_type; - ent_oi->u = ent_data; - - switch (ent_type) { - case op_inode_node: - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - inode->i_op = &openprom_inode_operations; - inode->i_fop = &openprom_operations; - set_nlink(inode, 2); - break; - case op_inode_prop: - if (of_node_name_eq(dp, "options") && (len == 17) && - !strncmp (name, "security-password", 17)) - inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; - else - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &openpromfs_prop_ops; - set_nlink(inode, 1); - inode->i_size = ent_oi->u.prop->length; - break; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_op = &openprom_inode_operations; + inode->i_fop = &openprom_operations; + set_nlink(inode, 2); + break; + case op_inode_prop: + if (of_node_name_eq(dp, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) + 
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; + else + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &openpromfs_prop_ops; + set_nlink(inode, 1); + inode->i_size = ent_oi->u.prop->length; + break; + } + unlock_new_inode(inode); } return d_splice_alias(inode, dentry); @@ -345,20 +349,9 @@ static void openprom_free_inode(struct inode *inode) static struct inode *openprom_iget(struct super_block *sb, ino_t ino) { - struct inode *inode; - - inode = iget_locked(sb, ino); + struct inode *inode = iget_locked(sb, ino); if (!inode) - return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - if (inode->i_ino == OPENPROM_ROOT_INO) { - inode->i_op = &openprom_inode_operations; - inode->i_fop = &openprom_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - } - unlock_new_inode(inode); - } + inode = ERR_PTR(-ENOMEM); return inode; } @@ -394,9 +387,15 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc) goto out_no_root; } + root_inode->i_mtime = root_inode->i_atime = + root_inode->i_ctime = current_time(root_inode); + root_inode->i_op = &openprom_inode_operations; + root_inode->i_fop = &openprom_operations; + root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; oi = OP_I(root_inode); oi->type = op_inode_node; oi->u.node = of_find_node_by_path("/"); + unlock_new_inode(root_inode); s->s_root = d_make_root(root_inode); if (!s->s_root) diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index d4b7ae763186..46b7dcff18ac 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -221,7 +221,7 @@ static int orangefs_inode_is_stale(struct inode *inode, * If the inode type or symlink target have changed then this * inode is stale. */ - if (type == -1 || !(inode->i_mode & type)) { + if (type == -1 || inode_wrong_type(inode, type)) { orangefs_make_bad_inode(inode); return 1; } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 9bd4167cc7fb..c144183a7e09 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -430,20 +430,11 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma) if (WARN_ON(file != vma->vm_file)) return -EIO; - vma->vm_file = get_file(realfile); + vma_set_file(vma, realfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = call_mmap(vma->vm_file, vma); revert_creds(old_cred); - - if (ret) { - /* Drop reference count from new vm_file value */ - fput(realfile); - } else { - /* Drop reference count from previous vm_file value */ - fput(file); - } - ovl_file_accessed(file); return ret; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 3fe05fb5d145..1d573972ce22 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -371,7 +371,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, return PTR_ERR(origin); if (upperdentry && !ovl_is_whiteout(upperdentry) && - ((d_inode(origin)->i_mode ^ d_inode(upperdentry)->i_mode) & S_IFMT)) + inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) goto invalid; if (!*stackp) @@ -730,7 +730,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = ERR_PTR(-ESTALE); goto out; } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || - ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) { + inode_wrong_type(inode, d_inode(origin)->i_mode)) { /* * Index should always be of the same file type as origin * except for the case of a whiteout index. 
A whiteout
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb87e4d89cd8..7ec59171f197 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -342,9 +342,11 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 	seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
 #ifdef CONFIG_SECCOMP
 	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+#ifdef CONFIG_SECCOMP_FILTER
 	seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
 			    atomic_read(&p->seccomp.filter_count));
 #endif
+#endif
 	seq_puts(m, "\nSpeculation_Store_Bypass:\t");
 	switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
 	case -EINVAL:
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index ca6d8a867285..fefe3d391d3a 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(mem_size,
 static unsigned int mem_type;
 module_param(mem_type, uint, 0400);
 MODULE_PARM_DESC(mem_type,
-		"set to 1 to try to use unbuffered memory (default 0)");
+		"memory type: 0=write-combined (default), 1=unbuffered, 2=cached");
 
 static int ramoops_max_reason = -1;
 module_param_named(max_reason, ramoops_max_reason, int, 0400);
@@ -648,6 +648,10 @@ static int ramoops_parse_dt(struct platform_device *pdev,
 	pdata->mem_size = resource_size(res);
 	pdata->mem_address = res->start;
+	/*
+	 * Setting "unbuffered" is deprecated and will be ignored if
+	 * "mem_type" is also specified.
+	 */
 	pdata->mem_type = of_property_read_bool(of_node, "unbuffered");
 	/*
 	 * Setting "no-dump-oops" is deprecated and will be ignored if
@@ -666,6 +670,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
 		field = value;						\
 	}
 
+	parse_u32("mem-type", pdata->mem_type, pdata->mem_type);
 	parse_u32("record-size", pdata->record_size, 0);
 	parse_u32("console-size", pdata->console_size, 0);
 	parse_u32("ftrace-size", pdata->ftrace_size, 0);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index fff363bfd484..fe5305028c6e 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -396,6 +396,10 @@ void persistent_ram_zap(struct persistent_ram_zone *prz)
 	persistent_ram_update_header_ecc(prz);
 }
 
+#define MEM_TYPE_WCOMBINE	0
+#define MEM_TYPE_NONCACHED	1
+#define MEM_TYPE_NORMAL		2
+
 static void *persistent_ram_vmap(phys_addr_t start, size_t size,
 		unsigned int memtype)
 {
@@ -409,10 +413,20 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size,
 	page_start = start - offset_in_page(start);
 	page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
 
-	if (memtype)
+	switch (memtype) {
+	case MEM_TYPE_NORMAL:
+		prot = PAGE_KERNEL;
+		break;
+	case MEM_TYPE_NONCACHED:
 		prot = pgprot_noncached(PAGE_KERNEL);
-	else
+		break;
+	case MEM_TYPE_WCOMBINE:
 		prot = pgprot_writecombine(PAGE_KERNEL);
+		break;
+	default:
+		pr_err("invalid mem_type=%d\n", memtype);
+		return NULL;
+	}
 
 	pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
 	if (!pages) {
diff --git a/fs/readdir.c b/fs/readdir.c
index 19434b3c982c..09e8ed7d4161 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -150,6 +150,9 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
 	if (buf->result)
 		return -EINVAL;
+	buf->result = verify_dirent_name(name, namlen);
+	if (buf->result < 0)
+		return buf->result;
 	d_ino = ino;
 	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
 		buf->result = -EOVERFLOW;
@@ -405,6 +408,9 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
 	if (buf->result)
 		return -EINVAL;
+	buf->result = verify_dirent_name(name, namlen);
+	if (buf->result < 0)
+		return buf->result;
 	d_ino = ino;
 	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
 		buf->result = -EOVERFLOW;
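Both one-shot filldir callbacks above now reject corrupt names before anything is copied to user space. The verify_dirent_name() helper they call is already defined near the top of fs/readdir.c; quoted from memory, it is essentially the following, so treat the exact bounds as approximate:

/*
 * Sanity-check a dirent name returned by the filesystem: it must be
 * non-empty, shorter than PATH_MAX, and must not contain a '/'.
 */
static int verify_dirent_name(const char *name, int len)
{
	if (len <= 0 || len >= PATH_MAX)
		return -EIO;
	if (memchr(name, '/', len))
		return -EIO;
	return 0;
}

Rejecting such entries with -EIO is deliberate: a name with an embedded '/' can only come from a corrupted or hostile filesystem image, and must not reach userspace as a plausible-looking dirent.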
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a4aaeea63893..dc3e26e9ed7b 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -102,7 +102,8 @@ static int switch_gc_head(struct ubifs_info *c)
  * This function compares data nodes @a and @b. Returns %1 if @a has greater
  * inode or block number, and %-1 otherwise.
  */
-static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int data_nodes_cmp(void *priv, const struct list_head *a,
+			  const struct list_head *b)
 {
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
@@ -145,8 +146,8 @@ static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
  * first and sorted by length in descending order. Directory entry nodes go
  * after inode nodes and are sorted in ascending hash value order.
  */
-static int nondata_nodes_cmp(void *priv, struct list_head *a,
-			     struct list_head *b)
+static int nondata_nodes_cmp(void *priv, const struct list_head *a,
+			     const struct list_head *b)
 {
 	ino_t inuma, inumb;
 	struct ubifs_info *c = priv;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 0f8a6a16421b..4d17e5382b74 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -298,8 +298,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
  * entries @a and @b by comparing their sequence number. Returns %1 if @a has
  * greater sequence number and %-1 otherwise.
  */
-static int replay_entries_cmp(void *priv, struct list_head *a,
-			      struct list_head *b)
+static int replay_entries_cmp(void *priv, const struct list_head *a,
+			      const struct list_head *b)
 {
 	struct ubifs_info *c = priv;
 	struct replay_entry *ra, *rb;
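The ubifs hunks above, like the xfs hunks later in this section, are the same mechanical change: list_sort() comparators now take const struct list_head pointers, matching the constified list_sort() prototype introduced elsewhere in this series. A sketch of the resulting comparator shape, using a made-up struct item purely for illustration:

#include <linux/list.h>
#include <linux/list_sort.h>

struct item {
	struct list_head list;
	int key;
};

static int item_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct item *ia = container_of(a, struct item, list);
	const struct item *ib = container_of(b, struct item, list);

	/* list_sort() only cares about the sign of the result */
	return ia->key < ib->key ? -1 : ia->key > ib->key;
}

static void sort_items(struct list_head *head)
{
	list_sort(NULL, head, item_cmp);
}

The const qualification documents that a comparator must only inspect the nodes; all relinking during the merge sort is done by list_sort() itself.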
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 7aee0ec63ade..eac6788fc6cf 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -225,7 +225,7 @@ static struct dentry *vboxsf_dir_lookup(struct inode *parent,
 	} else {
 		inode = vboxsf_new_inode(parent->i_sb);
 		if (!IS_ERR(inode))
-			vboxsf_init_inode(sbi, inode, &fsinfo);
+			vboxsf_init_inode(sbi, inode, &fsinfo, false);
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -245,7 +245,7 @@ static int vboxsf_dir_instantiate(struct inode *parent, struct dentry *dentry,
 	sf_i = VBOXSF_I(inode);
 	/* The host may have given us different attr than requested */
 	sf_i->force_restat = 1;
-	vboxsf_init_inode(sbi, inode, info);
+	vboxsf_init_inode(sbi, inode, info, false);
 
 	d_instantiate(dentry, inode);
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index d7816c01a4f6..4f5e59f06284 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -207,7 +207,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
 		err = -ENOMEM;
 		goto fail_unmap;
 	}
-	vboxsf_init_inode(sbi, iroot, &sbi->root_info);
+	vboxsf_init_inode(sbi, iroot, &sbi->root_info, false);
 	unlock_new_inode(iroot);
 
 	droot = d_make_root(iroot);
@@ -418,7 +418,7 @@ static int vboxsf_reconfigure(struct fs_context *fc)
 
 	/* Apply changed options to the root inode */
 	sbi->o = ctx->o;
-	vboxsf_init_inode(sbi, iroot, &sbi->root_info);
+	vboxsf_init_inode(sbi, iroot, &sbi->root_info, true);
 
 	return 0;
 }
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 3b847e3fba24..aec2ebf7d25a 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -45,12 +45,12 @@ struct inode *vboxsf_new_inode(struct super_block *sb)
 }
 
 /* set [inode] attributes based on [info], uid/gid based on [sbi] */
-void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
-		       const struct shfl_fsobjinfo *info)
+int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
+		      const struct shfl_fsobjinfo *info, bool reinit)
 {
 	const struct shfl_fsobjattr *attr;
 	s64 allocated;
-	int mode;
+	umode_t mode;
 
 	attr = &info->attr;
 
@@ -75,29 +75,44 @@ void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
 		inode->i_mapping->a_ops = &vboxsf_reg_aops;
 
 	if (SHFL_IS_DIRECTORY(attr->mode)) {
-		inode->i_mode = sbi->o.dmode_set ? sbi->o.dmode : mode;
-		inode->i_mode &= ~sbi->o.dmask;
-		inode->i_mode |= S_IFDIR;
-		inode->i_op = &vboxsf_dir_iops;
-		inode->i_fop = &vboxsf_dir_fops;
-		/*
-		 * XXX: this probably should be set to the number of entries
-		 * in the directory plus two (. ..)
-		 */
-		set_nlink(inode, 1);
+		if (sbi->o.dmode_set)
+			mode = sbi->o.dmode;
+		mode &= ~sbi->o.dmask;
+		mode |= S_IFDIR;
+		if (!reinit) {
+			inode->i_op = &vboxsf_dir_iops;
+			inode->i_fop = &vboxsf_dir_fops;
+			/*
+			 * XXX: this probably should be set to the number of entries
+			 * in the directory plus two (. ..)
+			 */
+			set_nlink(inode, 1);
+		} else if (!S_ISDIR(inode->i_mode))
+			return -ESTALE;
+		inode->i_mode = mode;
 	} else if (SHFL_IS_SYMLINK(attr->mode)) {
-		inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
-		inode->i_mode &= ~sbi->o.fmask;
-		inode->i_mode |= S_IFLNK;
-		inode->i_op = &vboxsf_lnk_iops;
-		set_nlink(inode, 1);
+		if (sbi->o.fmode_set)
+			mode = sbi->o.fmode;
+		mode &= ~sbi->o.fmask;
+		mode |= S_IFLNK;
+		if (!reinit) {
+			inode->i_op = &vboxsf_lnk_iops;
+			set_nlink(inode, 1);
+		} else if (!S_ISLNK(inode->i_mode))
+			return -ESTALE;
+		inode->i_mode = mode;
 	} else {
-		inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
-		inode->i_mode &= ~sbi->o.fmask;
-		inode->i_mode |= S_IFREG;
-		inode->i_op = &vboxsf_reg_iops;
-		inode->i_fop = &vboxsf_reg_fops;
-		set_nlink(inode, 1);
+		if (sbi->o.fmode_set)
+			mode = sbi->o.fmode;
+		mode &= ~sbi->o.fmask;
+		mode |= S_IFREG;
+		if (!reinit) {
+			inode->i_op = &vboxsf_reg_iops;
+			inode->i_fop = &vboxsf_reg_fops;
+			set_nlink(inode, 1);
+		} else if (!S_ISREG(inode->i_mode))
+			return -ESTALE;
+		inode->i_mode = mode;
 	}
 
 	inode->i_uid = sbi->o.uid;
@@ -116,6 +131,7 @@ void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
 		info->change_time.ns_relative_to_unix_epoch);
 	inode->i_mtime = ns_to_timespec64(
 		info->modification_time.ns_relative_to_unix_epoch);
+	return 0;
 }
 
 int vboxsf_create_at_dentry(struct dentry *dentry,
@@ -199,7 +215,9 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
 	dentry->d_time = jiffies;
 	sf_i->force_restat = 0;
-	vboxsf_init_inode(sbi, inode, &info);
+	err = vboxsf_init_inode(sbi, inode, &info, true);
+	if (err)
+		return err;
 
 	/*
 	 * If the file was changed on the host side we need to invalidate the
diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h
index 760524e78c88..6a7a9cedebc6 100644
--- a/fs/vboxsf/vfsmod.h
+++ b/fs/vboxsf/vfsmod.h
@@ -82,8 +82,8 @@ extern const struct dentry_operations vboxsf_dentry_ops;
 
 /* from utils.c */
 struct inode *vboxsf_new_inode(struct super_block *sb);
-void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
-		       const struct shfl_fsobjinfo *info);
+int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
+		      const struct shfl_fsobjinfo *info, bool reinit);
 int vboxsf_create_at_dentry(struct dentry *dentry,
 			    struct shfl_createparms *params);
 int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path,
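vboxsf_init_inode() now distinguishes first-time initialization from a revalidate-driven reinit and refuses to flip a live inode to a different file type, returning -ESTALE instead of rewriting i_op/i_fop underneath existing users. Several other hunks in this section (ocfs2, orangefs, overlayfs) make the same check through the new inode_wrong_type() helper added to include/linux/fs.h elsewhere in this series; quoted from memory, its definition is essentially:

/*
 * True if @mode describes a different file type (S_IFMT bits) than the
 * one @inode was created with.
 */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
	return (inode->i_mode ^ mode) & S_IFMT;
}

Bailing out with -ESTALE forces the caller through a fresh lookup rather than morphing the inode in place, which would invalidate assumptions held by everything that already has a reference to it.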
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index 88fb25119899..24d1b54de807 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -3,9 +3,13 @@
 config FS_VERITY
 	bool "FS Verity (read-only file-based authenticity protection)"
 	select CRYPTO
-	# SHA-256 is selected as it's intended to be the default hash algorithm.
+	# SHA-256 is implied as it's intended to be the default hash algorithm.
 	# To avoid bloat, other wanted algorithms must be selected explicitly.
-	select CRYPTO_SHA256
+	# Note that CRYPTO_SHA256 denotes the generic C implementation, but
+	# some architectures provide optimized implementations of the same
+	# algorithm that may be used instead. In this case, CRYPTO_SHA256 may
+	# be omitted even if SHA-256 is being used.
+	imply CRYPTO_SHA256
 	help
 	  This option enables fs-verity.  fs-verity is the dm-verity
 	  mechanism implemented at the file level.  On supported
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index f88694f22d05..813b5f219113 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -63,8 +63,8 @@ xbitmap_init(
 static int
 xbitmap_range_cmp(
 	void			*priv,
-	struct list_head	*a,
-	struct list_head	*b)
+	const struct list_head	*a,
+	const struct list_head	*b)
 {
 	struct xbitmap_range	*ap;
 	struct xbitmap_range	*bp;
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 2344757ede63..e3a691937e92 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -265,8 +265,8 @@ xfs_trans_log_finish_bmap_update(
 static int
 xfs_bmap_update_diff_items(
 	void			*priv,
-	struct list_head	*a,
-	struct list_head	*b)
+	const struct list_head	*a,
+	const struct list_head	*b)
 {
 	struct xfs_bmap_intent	*ba;
 	struct xfs_bmap_intent	*bb;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 37a1d12762d8..592800c8852f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2124,9 +2124,9 @@ xfs_buf_delwri_queue(
  */
 static int
 xfs_buf_cmp(
-	void		*priv,
-	struct list_head *a,
-	struct list_head *b)
+	void			*priv,
+	const struct list_head	*a,
+	const struct list_head	*b)
 {
 	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
 	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ef17c1f6db32..a4075685d9eb 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -629,8 +629,8 @@ xfs_extent_busy_wait_all(
 int
 xfs_extent_busy_ag_cmp(
 	void			*priv,
-	struct list_head	*l1,
-	struct list_head	*l2)
+	const struct list_head	*l1,
+	const struct list_head	*l2)
 {
 	struct xfs_extent_busy	*b1 =
 		container_of(l1, struct xfs_extent_busy, list);
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 990ab3891971..8aea07100092 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -58,7 +58,8 @@ void xfs_extent_busy_wait_all(struct xfs_mount *mp);
 
 int
-xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
+	const struct list_head *b);
 
 static inline void xfs_extent_busy_sort(struct list_head *list)
 {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 93223ebb3372..2424230ca2c3 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -397,8 +397,8 @@ xfs_trans_free_extent(
 static int
 xfs_extent_free_diff_items(
 	void			*priv,
-	struct list_head	*a,
-	struct list_head	*b)
+	const struct list_head	*a,
+	const struct list_head	*b)
 {
 	struct xfs_mount	*mp = priv;
 	struct xfs_extent_free_item *ra;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 07ebccbbf4df..746f4eda724c 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -269,8 +269,8 @@ xfs_trans_log_finish_refcount_update(
 static int 
xfs_refcount_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct xfs_mount *mp = priv; struct xfs_refcount_intent *ra; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 49cebd68b672..dc4f0c9f0897 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -337,8 +337,8 @@ xfs_trans_log_finish_rmap_update( static int xfs_rmap_update_diff_items( void *priv, - struct list_head *a, - struct list_head *b) + const struct list_head *a, + const struct list_head *b) { struct xfs_mount *mp = priv; struct xfs_rmap_intent *ra; |