diff options
Diffstat (limited to 'fs')
47 files changed, 823 insertions, 318 deletions
@@ -68,9 +68,9 @@ struct aio_ring { #define AIO_RING_PAGES 8 struct kioctx_table { - struct rcu_head rcu; - unsigned nr; - struct kioctx *table[]; + struct rcu_head rcu; + unsigned nr; + struct kioctx __rcu *table[]; }; struct kioctx_cpu { @@ -115,7 +115,8 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct work_struct free_work; + struct rcu_head free_rcu; + struct work_struct free_work; /* see free_ioctx() */ /* * signals when all in-flight requests are done @@ -329,7 +330,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) for (i = 0; i < table->nr; i++) { struct kioctx *ctx; - ctx = table->table[i]; + ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; @@ -588,6 +589,12 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) return cancel(&kiocb->common); } +/* + * free_ioctx() should be RCU delayed to synchronize against the RCU + * protected lookup_ioctx() and also needs process context to call + * aio_free_ring(), so the double bouncing through kioctx->free_rcu and + * ->free_work. + */ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, free_work); @@ -601,6 +608,14 @@ static void free_ioctx(struct work_struct *work) kmem_cache_free(kioctx_cachep, ctx); } +static void free_ioctx_rcufn(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, free_rcu); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); @@ -609,8 +624,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref) if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + /* Synchronize against RCU protected table->table[] dereferences */ + call_rcu(&ctx->free_rcu, free_ioctx_rcufn); } /* @@ -651,9 +666,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) while (1) { if (table) for (i = 0; i < table->nr; i++) - if (!table->table[i]) { + if (!rcu_access_pointer(table->table[i])) { ctx->id = i; - table->table[i] = ctx; + rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -834,11 +849,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, } table = rcu_dereference_raw(mm->ioctx_table); - WARN_ON(ctx != table->table[ctx->id]); - table->table[ctx->id] = NULL; + WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); + RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); - /* percpu_ref_kill() will do the necessary call_rcu() */ + /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* @@ -880,7 +895,8 @@ void exit_aio(struct mm_struct *mm) skipped = 0; for (i = 0; i < table->nr; ++i) { - struct kioctx *ctx = table->table[i]; + struct kioctx *ctx = + rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; @@ -1069,7 +1085,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; - ctx = table->table[id]; + ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { percpu_ref_get(&ctx->users); ret = ctx; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4a181fcb5175..fe09ef9c21f3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1058,6 +1058,27 @@ retry: return 0; } +static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) +{ + struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); + + if (!disk) + return NULL; + /* + * Now that we hold gendisk reference we make sure bdev we looked up is + * not stale. If it is, it means device got removed and created before + * we looked up gendisk and we fail open in such case. Associating + * unhashed bdev with newly created gendisk could lead to two bdevs + * (and thus two independent caches) being associated with one device + * which is bad. + */ + if (inode_unhashed(bdev->bd_inode)) { + put_disk_and_module(disk); + return NULL; + } + return disk; +} + /** * bd_start_claiming - start claiming a block device * @bdev: block device of interest @@ -1094,7 +1115,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, * @bdev might not have been initialized properly yet, look up * and grab the outer block device the hard way. */ - disk = get_gendisk(bdev->bd_dev, &partno); + disk = bdev_get_gendisk(bdev, &partno); if (!disk) return ERR_PTR(-ENXIO); @@ -1111,8 +1132,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, else whole = bdgrab(bdev); - module_put(disk->fops->owner); - put_disk(disk); + put_disk_and_module(disk); if (!whole) return ERR_PTR(-ENOMEM); @@ -1407,10 +1427,10 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; - struct module *owner; int ret; int partno; int perm = 0; + bool first_open = false; if (mode & FMODE_READ) perm |= MAY_READ; @@ -1430,14 +1450,14 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) restart: ret = -ENXIO; - disk = get_gendisk(bdev->bd_dev, &partno); + disk = bdev_get_gendisk(bdev, &partno); if (!disk) goto out; - owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { + first_open = true; bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; @@ -1463,8 +1483,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); goto restart; } } @@ -1524,15 +1543,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_unlock_bdev; } - /* only one opener holds refs to the module and disk */ - put_disk(disk); - module_put(owner); } bdev->bd_openers++; if (for_part) bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); + /* only one opener holds refs to the module and disk */ + if (!first_open) + put_disk_and_module(disk); return 0; out_clear: @@ -1546,8 +1565,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); out: bdput(bdev); @@ -1770,8 +1788,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk->fops->release(disk, mode); } if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; @@ -1779,8 +1795,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) victim = bdev->bd_contains; bdev->bd_contains = NULL; - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e4054e533f6d..26484648d090 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1264,7 +1264,16 @@ again: while (node) { ref = rb_entry(node, struct prelim_ref, rbnode); node = rb_next(&ref->rbnode); - WARN_ON(ref->count < 0); + /* + * ref->count < 0 can happen here if there are delayed + * refs with a node->action of BTRFS_DROP_DELAYED_REF. + * prelim_ref_insert() relies on this when merging + * identical refs to keep the overall count correct. + * prelim_ref_insert() will merge only those refs + * which compare identically. Any refs having + * e.g. different offsets would not be merged, + * and would retain their original ref->count < 0. + */ if (roots && ref->count && ref->root_id && ref->parent == 0) { if (sc && sc->root_objectid && ref->root_id != sc->root_objectid) { @@ -1510,6 +1519,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) if (!node) break; bytenr = node->val; + shared.share_count = 0; cond_resched(); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1a462ab85c49..da308774b8a4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2974,7 +2974,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); - kfree(fs_info); + kvfree(fs_info); } /* tree mod log functions from ctree.c */ @@ -3095,7 +3095,10 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -int btrfs_find_name_in_ext_backref(struct btrfs_path *path, +int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, + const char *name, + int name_len, struct btrfs_inode_ref **ref_ret); +int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, u64 ref_objectid, const char *name, int name_len, struct btrfs_inode_extref **extref_ret); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a1a40cf382e3..7ab5e0128f0c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -821,7 +821,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(fs_info, record); + btrfs_qgroup_trace_extent_post(fs_info, record); + return 0; free_head_ref: diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 05751a677da4..c1618ab9fecf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2147,6 +2147,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 bytes; struct request_queue *req_q; + if (!stripe->dev->bdev) { + ASSERT(btrfs_test_opt(fs_info, DEGRADED)); + continue; + } req_q = bdev_get_queue(stripe->dev->bdev); if (!blk_queue_discard(req_q)) continue; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 39c968f80157..65e1a76bf755 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -22,10 +22,10 @@ #include "transaction.h" #include "print-tree.h" -static int find_name_in_backref(struct btrfs_path *path, const char *name, - int name_len, struct btrfs_inode_ref **ref_ret) +int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, + const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) { - struct extent_buffer *leaf; struct btrfs_inode_ref *ref; unsigned long ptr; unsigned long name_ptr; @@ -33,9 +33,8 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, u32 cur_offset = 0; int len; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { ref = (struct btrfs_inode_ref *)(ptr + cur_offset); len = btrfs_inode_ref_name_len(leaf, ref); @@ -44,18 +43,19 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, if (len != name_len) continue; if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { - *ref_ret = ref; + if (ref_ret) + *ref_ret = ref; return 1; } } return 0; } -int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, +int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, + u64 ref_objectid, const char *name, int name_len, struct btrfs_inode_extref **extref_ret) { - struct extent_buffer *leaf; struct btrfs_inode_extref *extref; unsigned long ptr; unsigned long name_ptr; @@ -63,9 +63,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, u32 cur_offset = 0; int ref_name_len; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); /* * Search all extended backrefs in this item. We're only @@ -113,7 +112,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return ERR_PTR(ret); if (ret > 0) return NULL; - if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) + if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len, + &extref)) return NULL; return extref; } @@ -155,7 +156,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * This should always succeed so error here will make the FS * readonly. */ - if (!btrfs_find_name_in_ext_backref(path, ref_objectid, + if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len, &extref)) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -225,7 +227,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, } else if (ret < 0) { goto out; } - if (!find_name_in_backref(path, name, name_len, &ref)) { + if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len, &ref)) { ret = -ENOENT; search_ext_refs = 1; goto out; @@ -293,7 +296,9 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { - if (btrfs_find_name_in_ext_backref(path, ref_objectid, + if (btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, name, name_len, NULL)) goto out; @@ -351,7 +356,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; - if (find_name_in_backref(path, name, name_len, &ref)) + if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len, &ref)) goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -365,7 +371,9 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ret = 0; } else if (ret < 0) { if (ret == -EOVERFLOW) { - if (find_name_in_backref(path, name, name_len, &ref)) + if (btrfs_find_name_in_backref(path->nodes[0], + path->slots[0], + name, name_len, &ref)) ret = -EEXIST; else ret = -EMLINK; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 53ca025655fc..f53470112670 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1335,8 +1335,11 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; goto error; + } if (ret > 0) break; leaf = path->nodes[0]; @@ -2040,12 +2043,15 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, struct list_head *list) { struct btrfs_ordered_sum *sum; + int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - btrfs_csum_file_blocks(trans, + ret = btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); trans->adding_csums = false; + if (ret) + return ret; } return 0; } @@ -3059,7 +3065,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, inode, &ordered_extent->list); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); @@ -3385,6 +3395,11 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, ret = btrfs_orphan_reserve_metadata(trans, inode); ASSERT(!ret); if (ret) { + /* + * dec doesn't need spin_lock as ->orphan_block_rsv + * would be released only if ->orphan_inodes is + * zero. + */ atomic_dec(&root->orphan_inodes); clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags); @@ -3399,12 +3414,17 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret) { - atomic_dec(&root->orphan_inodes); if (reserve) { clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags); btrfs_orphan_release_metadata(inode); } + /* + * btrfs_orphan_commit_root may race with us and set + * ->orphan_block_rsv to zero, in order to avoid that, + * decrease ->orphan_inodes after everything is done. + */ + atomic_dec(&root->orphan_inodes); if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags); @@ -3436,28 +3456,26 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; int delete_item = 0; - int release_rsv = 0; int ret = 0; - spin_lock(&root->orphan_lock); if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags)) delete_item = 1; + if (delete_item && trans) + ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags)) - release_rsv = 1; - spin_unlock(&root->orphan_lock); + btrfs_orphan_release_metadata(inode); - if (delete_item) { + /* + * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv + * to zero, in order to avoid that, decrease ->orphan_inodes after + * everything is done. + */ + if (delete_item) atomic_dec(&root->orphan_inodes); - if (trans) - ret = btrfs_del_orphan_item(trans, root, - btrfs_ino(inode)); - } - - if (release_rsv) - btrfs_orphan_release_metadata(inode); return ret; } @@ -5281,7 +5299,7 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); if (!root) { - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + clear_inode(inode); return; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e61dd624f7b..aa259d6986e1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1442,8 +1442,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, int ret; ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); - if (ret < 0) - return ret; + if (ret < 0) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_warn(fs_info, +"error accounting new delayed refs extent (err code: %d), quota inconsistent", + ret); + return 0; + } /* * Here we don't need to get the lock of diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index dec0907dfb8a..fcfc20de2df3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1370,6 +1370,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe_start = stripe->physical; if (physical >= stripe_start && physical < stripe_start + rbio->stripe_len && + stripe->dev->bdev && bio->bi_disk == stripe->dev->bdev->bd_disk && bio->bi_partno == stripe->dev->bdev->bd_partno) { return i; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f0c3f00e97cb..cd2298d185dd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3268,8 +3268,22 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, - 0); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, + NULL, 0); + if (ret) { + unlock_page(page); + put_page(page); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, + EXTENT_LOCKED | EXTENT_BOUNDARY); + goto out; + + } set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f306c608dc28..484e2af793de 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5005,6 +5005,9 @@ static int send_hole(struct send_ctx *sctx, u64 end) u64 len; int ret = 0; + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, end - offset); + p = fs_path_alloc(); if (!p) return -ENOMEM; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6e71a2a78363..4b817947e00f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1545,7 +1545,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, * it for searching for existing supers, so this lets us do that and * then open_ctree will properly initialize everything later. */ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index afadaadab18e..434457794c27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -29,6 +29,7 @@ #include "hash.h" #include "compression.h" #include "qgroup.h" +#include "inode-map.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -966,7 +967,9 @@ static noinline int backref_in_log(struct btrfs_root *log, ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); if (key->type == BTRFS_INODE_EXTREF_KEY) { - if (btrfs_find_name_in_ext_backref(path, ref_objectid, + if (btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, name, namelen, NULL)) match = 1; @@ -1190,7 +1193,8 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, read_extent_buffer(eb, *name, (unsigned long)&extref->name, *namelen); - *index = btrfs_inode_extref_index(eb, extref); + if (index) + *index = btrfs_inode_extref_index(eb, extref); if (parent_objectid) *parent_objectid = btrfs_inode_extref_parent(eb, extref); @@ -1211,12 +1215,102 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); - *index = btrfs_inode_ref_index(eb, ref); + if (index) + *index = btrfs_inode_ref_index(eb, ref); return 0; } /* + * Take an inode reference item from the log tree and iterate all names from the + * inode reference item in the subvolume tree with the same key (if it exists). + * For any name that is not in the inode reference item from the log tree, do a + * proper unlink of that name (that is, remove its entry from the inode + * reference item and both dir index keys). + */ +static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_inode *inode, + struct extent_buffer *log_eb, + int log_slot, + struct btrfs_key *key) +{ + int ret; + unsigned long ref_ptr; + unsigned long ref_end; + struct extent_buffer *eb; + +again: + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) { + ret = 0; + goto out; + } + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); + while (ref_ptr < ref_end) { + char *name = NULL; + int namelen; + u64 parent_id; + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + NULL, &parent_id); + } else { + parent_id = key->offset; + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + NULL); + } + if (ret) + goto out; + + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, name, + namelen, NULL); + else + ret = btrfs_find_name_in_backref(log_eb, log_slot, name, + namelen, NULL); + + if (!ret) { + struct inode *dir; + + btrfs_release_path(path); + dir = read_one_inode(root, parent_id); + if (!dir) { + ret = -ENOENT; + kfree(name); + goto out; + } + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + inode, name, namelen); + kfree(name); + iput(dir); + if (ret) + goto out; + goto again; + } + + kfree(name); + ref_ptr += namelen; + if (key->type == BTRFS_INODE_EXTREF_KEY) + ref_ptr += sizeof(struct btrfs_inode_extref); + else + ref_ptr += sizeof(struct btrfs_inode_ref); + } + ret = 0; + out: + btrfs_release_path(path); + return ret; +} + +/* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. * root is the destination we are replaying into, and path is for temp @@ -1344,6 +1438,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } } + /* + * Before we overwrite the inode reference item in the subvolume tree + * with the item from the log tree, we must unlink all names from the + * parent directory that are in the subvolume's tree inode reference + * item, otherwise we end up with an inconsistent subvolume tree where + * dir index entries exist for a name but there is no inode reference + * item with the same name. + */ + ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, + key); + if (ret) + goto out; + /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); out: @@ -2472,6 +2579,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + } else { + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) + clear_extent_buffer_dirty(next); } WARN_ON(root_owner != @@ -2552,6 +2662,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + } else { + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) + clear_extent_buffer_dirty(next); } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); @@ -2630,6 +2743,9 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + } else { + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) + clear_extent_buffer_dirty(next); } WARN_ON(log->root_key.objectid != @@ -3018,13 +3134,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, + 0, &start, &end, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, NULL); if (ret) break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); } /* @@ -5677,6 +5794,23 @@ again: path); } + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + struct btrfs_root *root = wc.replay_dest; + + btrfs_release_path(path); + + /* + * We have just replayed everything, and the highest + * objectid of fs roots probably has changed in case + * some inode_item's got replayed. + * + * root->objectid_mutex is not acquired as log replay + * could only happen during mount. + */ + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + } + key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); @@ -5825,7 +5959,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, * this will force the logging code to walk the dentry chain * up for the file */ - if (S_ISREG(inode->vfs_inode.i_mode)) + if (!S_ISDIR(inode->vfs_inode.i_mode)) inode->last_unlink_trans = trans->transid; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b5036bd69e6a..b2d05c6b1c56 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -645,6 +645,7 @@ static void btrfs_free_stale_devices(const char *path, btrfs_sysfs_remove_fsid(fs_devs); list_del(&fs_devs->list); free_fs_devices(fs_devs); + break; } else { fs_devs->num_devices--; list_del(&dev->dev_list); @@ -4828,10 +4829,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ndevs = min(ndevs, devs_max); /* - * the primary goal is to maximize the number of stripes, so use as many - * devices as possible, even if the stripes are not maximum sized. + * The primary goal is to maximize the number of stripes, so use as + * many devices as possible, even if the stripes are not maximum sized. + * + * The DUP profile stores more than one stripe per device, the + * max_avail is the total size so we have to adjust. */ - stripe_size = devices_info[ndevs-1].max_avail; + stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); num_stripes = ndevs * dev_stripes; /* @@ -4866,8 +4870,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; } - stripe_size = div_u64(stripe_size, dev_stripes); - /* align to BTRFS_STRIPE_LEN */ stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6582c4507e6c..0e5bd3e3344e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3965,6 +3965,32 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) } /* + * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +int ceph_drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&ci->i_ceph_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + + ci->i_ceph_flags |= CEPH_I_NODELAY; + if (__ceph_caps_dirty(ci)) { + struct ceph_mds_client *mdsc = + ceph_inode_to_client(inode)->mdsc; + __cap_delay_requeue_front(mdsc, ci); + } + } + spin_unlock(&ci->i_ceph_lock); + return drop; +} + +/* * Helpers for embedding cap and dentry lease releases into mds * requests. * diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0c4346806e17..f1d9c6cc0491 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1003,26 +1003,6 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } /* - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it - * looks like the link count will hit 0, drop any other caps (other - * than PIN) we don't specifically want (due to the file still being - * open). - */ -static int drop_caps_for_unlink(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - - spin_lock(&ci->i_ceph_lock); - if (inode->i_nlink == 1) { - drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); - ci->i_ceph_flags |= CEPH_I_NODELAY; - } - spin_unlock(&ci->i_ceph_lock); - return drop; -} - -/* * rmdir and unlink are differ only by the metadata op code */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) @@ -1056,7 +1036,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - req->r_inode_drop = drop_caps_for_unlink(inode); + req->r_inode_drop = ceph_drop_caps_for_unlink(inode); err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) d_delete(dentry); @@ -1104,8 +1084,10 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - if (d_really_is_positive(new_dentry)) - req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry)); + if (d_really_is_positive(new_dentry)) { + req->r_inode_drop = + ceph_drop_caps_for_unlink(d_inode(new_dentry)); + } err = ceph_mdsc_do_request(mdsc, old_dir, req); if (!err && !req->r_reply_info.head->is_dentry) { /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a62d2a9841dc..fb2bc9c15a23 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -225,6 +225,7 @@ static int parse_fsopt_token(char *c, void *private) return -ENOMEM; break; case Opt_mds_namespace: + kfree(fsopt->mds_namespace); fsopt->mds_namespace = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, GFP_KERNEL); @@ -232,6 +233,7 @@ static int parse_fsopt_token(char *c, void *private) return -ENOMEM; break; case Opt_fscache_uniq: + kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, GFP_KERNEL); @@ -711,14 +713,17 @@ static int __init init_caches(void) goto bad_dentry; ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); - if (!ceph_file_cachep) goto bad_file; - if ((error = ceph_fscache_register())) - goto bad_file; + error = ceph_fscache_register(); + if (error) + goto bad_fscache; return 0; + +bad_fscache: + kmem_cache_destroy(ceph_file_cachep); bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: @@ -836,7 +841,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) int err; unsigned long started = jiffies; /* note the start time */ struct dentry *root; - int first = 0; /* first vfsmount for this super_block */ dout("mount start %p\n", fsc); mutex_lock(&fsc->client->mount_mutex); @@ -861,17 +865,17 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) path = fsc->mount_options->server_path + 1; dout("mount opening path %s\n", path); } + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto out; + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } fsc->sb->s_root = dget(root); - first = 1; - - err = ceph_fs_debugfs_init(fsc); - if (err < 0) - goto fail; } else { root = dget(fsc->sb->s_root); } @@ -881,11 +885,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) mutex_unlock(&fsc->client->mount_mutex); return root; -fail: - if (first) { - dput(fsc->sb->s_root); - fsc->sb->s_root = NULL; - } out: mutex_unlock(&fsc->client->mount_mutex); return ERR_PTR(err); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 21b2e5b004eb..1c2086e0fec2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -987,7 +987,7 @@ extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); - +extern int ceph_drop_caps_for_unlink(struct inode *inode); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, diff --git a/fs/dcache.c b/fs/dcache.c index 7c38f39958bc..8945e6cabd93 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -647,11 +647,16 @@ again: spin_unlock(&parent->d_lock); goto again; } - rcu_read_unlock(); - if (parent != dentry) + if (parent != dentry) { spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - else + if (unlikely(dentry->d_lockref.count < 0)) { + spin_unlock(&parent->d_lock); + parent = NULL; + } + } else { parent = NULL; + } + rcu_read_unlock(); return parent; } @@ -2474,7 +2479,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, retry: rcu_read_lock(); - seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1; + seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { @@ -2495,8 +2500,14 @@ retry: rcu_read_unlock(); goto retry; } + + if (unlikely(seq & 1)) { + rcu_read_unlock(); + goto retry; + } + hlist_bl_lock(b); - if (unlikely(parent->d_inode->i_dir_seq != seq)) { + if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; diff --git a/fs/direct-io.c b/fs/direct-io.c index a0ca9e48e993..1357ef563893 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1274,8 +1274,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host)) + if (iocb->ki_flags & IOCB_DSYNC) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 5f22e74bbade..8e568428c88b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -8,6 +8,7 @@ */ #include <linux/efi.h> +#include <linux/delay.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mount.h> @@ -74,6 +75,11 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, ssize_t size = 0; int err; + while (!__ratelimit(&file->f_cred->user->ratelimit)) { + if (!msleep_interruptible(50)) + return -EINTR; + } + err = efivar_entry_size(var, &datasize); /* diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 86863792f36a..51f940e76c5e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -716,7 +716,7 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, __be64 *ptr; sector_t lblock; sector_t lend; - int ret; + int ret = 0; int eob; unsigned int len; struct buffer_head *bh; @@ -728,12 +728,14 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, goto out; } - if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) { - gfs2_stuffed_iomap(inode, iomap); - if (pos >= iomap->length) - return -ENOENT; - ret = 0; - goto out; + if (gfs2_is_stuffed(ip)) { + if (flags & IOMAP_REPORT) { + gfs2_stuffed_iomap(inode, iomap); + if (pos >= iomap->length) + ret = -ENOENT; + goto out; + } + BUG_ON(!(flags & IOMAP_WRITE)); } lblock = pos >> inode->i_blkbits; @@ -744,7 +746,7 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, iomap->type = IOMAP_HOLE; iomap->length = (u64)(lend - lblock) << inode->i_blkbits; iomap->flags = IOMAP_F_MERGED; - bmap_lock(ip, 0); + bmap_lock(ip, flags & IOMAP_WRITE); /* * Directory data blocks have a struct gfs2_meta_header header, so the @@ -787,27 +789,25 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, iomap->flags |= IOMAP_F_BOUNDARY; iomap->length = (u64)len << inode->i_blkbits; - ret = 0; - out_release: release_metapath(&mp); - bmap_unlock(ip, 0); + bmap_unlock(ip, flags & IOMAP_WRITE); out: trace_gfs2_iomap_end(ip, iomap, ret); return ret; do_alloc: - if (!(flags & IOMAP_WRITE)) { - if (pos >= i_size_read(inode)) { + if (flags & IOMAP_WRITE) { + ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + } else if (flags & IOMAP_REPORT) { + loff_t size = i_size_read(inode); + if (pos >= size) ret = -ENOENT; - goto out_release; - } - ret = 0; - iomap->length = hole_size(inode, lblock, &mp); - goto out_release; + else if (height <= ip->i_height) + iomap->length = hole_size(inode, lblock, &mp); + else + iomap->length = size - pos; } - - ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); goto out_release; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8fe1b0aa2896..b9a254dcc0e7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } +/* + * Mask used when checking the page offset value passed in via system + * calls. This value will be converted to a loff_t which is signed. + * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the + * value. The extra bit (- 1 in the shift value) is to take the sign + * bit into account. + */ +#define PGOFF_LOFFT_MAX \ + (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); @@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &hugetlb_vm_ops; /* - * Offset passed to mmap (before page shift) could have been - * negative when represented as a (l)off_t. + * page based offset in vm_pgoff could be sufficiently large to + * overflow a (l)off_t when converted to byte offset. */ - if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; + /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; diff --git a/fs/namei.c b/fs/namei.c index 921ae32dbc80..cafa365eeb70 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd) static bool path_connected(const struct path *path) { struct vfsmount *mnt = path->mnt; + struct super_block *sb = mnt->mnt_sb; - /* Only bind mounts can have disconnected paths */ - if (mnt->mnt_root == mnt->mnt_sb->s_root) + /* Bind mounts and multi-root filesystems can have disconnected paths */ + if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; return is_subdir(path->dentry, mnt->mnt_root); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2435af56b87e..a50d7813e3ea 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -572,7 +572,7 @@ out: } static bool -validate_bitmap_values(unsigned long mask) +validate_bitmap_values(unsigned int mask) { return (mask & ~RCA4_TYPE_MASK_ALL) == 0; } @@ -596,17 +596,15 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, goto out; status = cpu_to_be32(NFS4_OK); - if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG)) flags = FMODE_READ; - if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG)) flags |= FMODE_WRITE; - if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) - &args->craa_type_mask)) - pnfs_recall_all_layouts(cps->clp); if (flags) nfs_expire_unused_delegation_types(cps->clp, flags); + + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) + pnfs_recall_all_layouts(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8c10b0562e75..621c517b325c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -86,10 +86,10 @@ struct nfs_direct_req { struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; int mirror_count; + loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ bytes_left, /* bytes left to be sent */ - io_start, /* start of IO */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 49f848fd1f04..7327930ad970 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -873,7 +873,7 @@ static void nfs3_nlm_release_call(void *data) } } -const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { +static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { .nlmclnt_alloc_call = nfs3_nlm_alloc_call, .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, .nlmclnt_release_call = nfs3_nlm_release_call, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 04612c24d394..979631411a0e 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -868,8 +868,10 @@ static int nfs4_set_client(struct nfs_server *server, if (IS_ERR(clp)) return PTR_ERR(clp); - if (server->nfs_client == clp) + if (server->nfs_client == clp) { + nfs_put_client(clp); return -ELOOP; + } /* * Query for the lease time on clientid setup or renewal @@ -1244,11 +1246,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, net); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); - nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); return error; } + nfs_put_client(clp); if (server->nfs_client->cl_hostname == NULL) server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c13e826614b5..ee723aa153a3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -292,8 +292,11 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { - struct inode *inode = lo->plh_inode; + struct inode *inode; + if (!lo) + return; + inode = lo->plh_inode; pnfs_layoutreturn_before_put_layout_hdr(lo); if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { @@ -1241,10 +1244,12 @@ retry: spin_lock(&ino->i_lock); lo = nfsi->layout; if (!lo || !pnfs_layout_is_valid(lo) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + lo = NULL; goto out_noroc; + } + pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { - pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); @@ -1312,10 +1317,12 @@ out_noroc: struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(args); + pnfs_put_layout_hdr(lo); return true; } if (layoutreturn) pnfs_send_layoutreturn(lo, &stateid, iomode, true); + pnfs_put_layout_hdr(lo); return false; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29bacdc56f6a..5e470e233c83 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* initial superblock/root creation */ mount_info->fill_super(s, mount_info); nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + if (!(server->flags & NFS_MOUNT_UNSHARED)) + s->s_iflags |= SB_I_MULTIROOT; } mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7428a669d7a7..e7d8ceae8f26 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1876,40 +1876,43 @@ int nfs_generic_commit_list(struct inode *inode, struct list_head *head, return status; } -int nfs_commit_inode(struct inode *inode, int how) +static int __nfs_commit_inode(struct inode *inode, int how, + struct writeback_control *wbc) { LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; - int error = 0; - int res; + int ret, nscan; nfs_init_cinfo_from_inode(&cinfo, inode); nfs_commit_begin(cinfo.mds); - res = nfs_scan_commit(inode, &head, &cinfo); - if (res) - error = nfs_generic_commit_list(inode, &head, how, &cinfo); + for (;;) { + ret = nscan = nfs_scan_commit(inode, &head, &cinfo); + if (ret <= 0) + break; + ret = nfs_generic_commit_list(inode, &head, how, &cinfo); + if (ret < 0) + break; + ret = 0; + if (wbc && wbc->sync_mode == WB_SYNC_NONE) { + if (nscan < wbc->nr_to_write) + wbc->nr_to_write -= nscan; + else + wbc->nr_to_write = 0; + } + if (nscan < INT_MAX) + break; + cond_resched(); + } nfs_commit_end(cinfo.mds); - if (res == 0) - return res; - if (error < 0) - goto out_error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_commit(cinfo.mds); - if (error < 0) - return error; - return res; -out_error: - res = error; - /* Note: If we exit without ensuring that the commit is complete, - * we must mark the inode as dirty. Otherwise, future calls to - * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure - * that the data is on the disk. - */ -out_mark_dirty: - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return res; + if (ret || !may_wait) + return ret; + return wait_on_commit(cinfo.mds); +} + +int nfs_commit_inode(struct inode *inode, int how) +{ + return __nfs_commit_inode(inode, how, NULL); } EXPORT_SYMBOL_GPL(nfs_commit_inode); @@ -1919,11 +1922,11 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int flags = FLUSH_SYNC; int ret = 0; - /* no commits means nothing needs to be done */ - if (!atomic_long_read(&nfsi->commit_info.ncommit)) - return ret; - if (wbc->sync_mode == WB_SYNC_NONE) { + /* no commits means nothing needs to be done */ + if (!atomic_long_read(&nfsi->commit_info.ncommit)) + goto check_requests_outstanding; + /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. */ @@ -1934,16 +1937,16 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) flags = 0; } - ret = nfs_commit_inode(inode, flags); - if (ret >= 0) { - if (wbc->sync_mode == WB_SYNC_NONE) { - if (ret < wbc->nr_to_write) - wbc->nr_to_write -= ret; - else - wbc->nr_to_write = 0; - } - return 0; - } + ret = __nfs_commit_inode(inode, flags, wbc); + if (!ret) { + if (flags & FLUSH_SYNC) + return 0; + } else if (atomic_long_read(&nfsi->commit_info.ncommit)) + goto out_mark_dirty; + +check_requests_outstanding: + if (!atomic_read(&nfsi->commit_info.rpcs_out)) + return ret; out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 150521c9671b..61b770e39809 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -268,6 +268,35 @@ free_blocked_lock(struct nfsd4_blocked_lock *nbl) kfree(nbl); } +static void +remove_blocked_locks(struct nfs4_lockowner *lo) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; + LIST_HEAD(reaplist); + + /* Dequeue all blocked locks */ + spin_lock(&nn->blocked_locks_lock); + while (!list_empty(&lo->lo_blocked)) { + nbl = list_first_entry(&lo->lo_blocked, + struct nfsd4_blocked_lock, + nbl_list); + list_del_init(&nbl->nbl_list); + list_move(&nbl->nbl_lru, &reaplist); + } + spin_unlock(&nn->blocked_locks_lock); + + /* Now free them */ + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, + nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } +} + static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { @@ -1866,6 +1895,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp) static void __destroy_client(struct nfs4_client *clp) { + int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -1895,6 +1925,16 @@ __destroy_client(struct nfs4_client *clp) nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } + for (i = 0; i < OWNER_HASH_SIZE; i++) { + struct nfs4_stateowner *so, *tmp; + + list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + /* Should be no openowners at this point */ + WARN_ON_ONCE(so->so_is_open_owner); + remove_blocked_locks(lockowner(so)); + } + } nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -6355,6 +6395,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); + remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); return status; @@ -7140,6 +7181,8 @@ nfs4_state_destroy_net(struct net *net) } } + WARN_ON(!list_empty(&nn->blocked_locks_lru)); + for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->unconf_id_hashtbl[i])) { clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); @@ -7206,7 +7249,6 @@ nfs4_state_shutdown_net(struct net *net) struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct nfsd4_blocked_lock *nbl; cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -7227,24 +7269,6 @@ nfs4_state_shutdown_net(struct net *net) nfs4_put_stid(&dp->dl_stid); } - BUG_ON(!list_empty(&reaplist)); - spin_lock(&nn->blocked_locks_lock); - while (!list_empty(&nn->blocked_locks_lru)) { - nbl = list_first_entry(&nn->blocked_locks_lru, - struct nfsd4_blocked_lock, nbl_lru); - list_move(&nbl->nbl_lru, &reaplist); - list_del_init(&nbl->nbl_list); - } - spin_unlock(&nn->blocked_locks_lock); - - while (!list_empty(&reaplist)) { - nbl = list_first_entry(&reaplist, - struct nfsd4_blocked_lock, nbl_lru); - list_del_init(&nbl->nbl_lru); - posix_unblock_lock(&nbl->nbl_lock); - free_blocked_lock(nbl); - } - nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); } diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 406e72de88f6..ce6ff5a0a6e4 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,8 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. + If unsure, say N. + config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW bool "Overlayfs: follow redirects even if redirects are turned off" default y @@ -32,8 +34,13 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW Disable this to get a possibly more secure configuration, but that might not be backward compatible with previous kernels. + If backward compatibility is not an issue, then it is safe and + recommended to say N here. + For more information, see Documentation/filesystems/overlayfs.txt + If unsure, say Y. + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS @@ -51,6 +58,8 @@ config OVERLAY_FS_INDEX That is, mounting an overlay which has an inodes index on a kernel that doesn't support this feature will have unexpected results. + If unsure, say N. + config OVERLAY_FS_NFS_EXPORT bool "Overlayfs: turn on NFS export feature by default" depends on OVERLAY_FS @@ -72,3 +81,8 @@ config OVERLAY_FS_NFS_EXPORT Note, that the NFS export feature is not backward compatible. That is, mounting an overlay which has a full index on a kernel that doesn't support this feature will have unexpected results. + + Most users should say N here and enable this feature on a case-by- + case basis with the "nfs_export=on" mount option. + + Say N unless you fully understand the consequences. diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index bb94ce9da5c8..87bd4148f4fb 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -19,6 +19,142 @@ #include <linux/ratelimit.h> #include "overlayfs.h" +static int ovl_encode_maybe_copy_up(struct dentry *dentry) +{ + int err; + + if (ovl_dentry_upper(dentry)) + return 0; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up(dentry); + ovl_drop_write(dentry); + } + + if (err) { + pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n", + dentry, err); + } + + return err; +} + +/* + * Before encoding a non-upper directory file handle from real layer N, we need + * to check if it will be possible to reconnect an overlay dentry from the real + * lower decoded dentry. This is done by following the overlay ancestry up to a + * "layer N connected" ancestor and verifying that all parents along the way are + * "layer N connectable". If an ancestor that is NOT "layer N connectable" is + * found, we need to copy up an ancestor, which is "layer N connectable", thus + * making that ancestor "layer N connected". For example: + * + * layer 1: /a + * layer 2: /a/b/c + * + * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is + * copied up and renamed, upper dir /a will be indexed by lower dir /a from + * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*) + * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay + * dentry from the connected lower dentry /a/b/c. + * + * To avoid this problem on decode time, we need to copy up an ancestor of + * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is + * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected" + * and when the time comes to decode the file handle from lower dentry /a/b/c, + * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding + * a connected overlay dentry will be accomplished. + * + * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an + * entry /a in the lower layers above layer N and find the indexed dir /a from + * layer 1. If that improvement is made, then the check for "layer N connected" + * will need to verify there are no redirects in lower layers above N. In the + * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a + * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable": + * + * layer 1: /A (redirect = /a) + * layer 2: /a/b/c + */ + +/* Return the lowest layer for encoding a connectable file handle */ +static int ovl_connectable_layer(struct dentry *dentry) +{ + struct ovl_entry *oe = OVL_E(dentry); + + /* We can get overlay root from root of any layer */ + if (dentry == dentry->d_sb->s_root) + return oe->numlower; + + /* + * If it's an unindexed merge dir, then it's not connectable with any + * lower layer + */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return 0; + + /* We can get upper/overlay path from indexed/lower dentry */ + return oe->lowerstack[0].layer->idx; +} + +/* + * @dentry is "connected" if all ancestors up to root or a "connected" ancestor + * have the same uppermost lower layer as the origin's layer. We may need to + * copy up a "connectable" ancestor to make it "connected". A "connected" dentry + * cannot become non "connected", so cache positive result in dentry flags. + * + * Return the connected origin layer or < 0 on error. + */ +static int ovl_connect_layer(struct dentry *dentry) +{ + struct dentry *next, *parent = NULL; + int origin_layer; + int err = 0; + + if (WARN_ON(dentry == dentry->d_sb->s_root) || + WARN_ON(!ovl_dentry_lower(dentry))) + return -EIO; + + origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx; + if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) + return origin_layer; + + /* Find the topmost origin layer connectable ancestor of @dentry */ + next = dget(dentry); + for (;;) { + parent = dget_parent(next); + if (WARN_ON(parent == next)) { + err = -EIO; + break; + } + + /* + * If @parent is not origin layer connectable, then copy up + * @next which is origin layer connectable and we are done. + */ + if (ovl_connectable_layer(parent) < origin_layer) { + err = ovl_encode_maybe_copy_up(next); + break; + } + + /* If @parent is connected or indexed we are done */ + if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) || + ovl_test_flag(OVL_INDEX, d_inode(parent))) + break; + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + if (!err) + ovl_dentry_set_flag(OVL_E_CONNECTED, dentry); + + return err ?: origin_layer; +} + /* * We only need to encode origin if there is a chance that the same object was * encoded pre copy up and then we need to stay consistent with the same @@ -41,73 +177,59 @@ * L = lower file handle * * (*) Connecting an overlay dir from real lower dentry is not always - * possible when there are redirects in lower layers. To mitigate this case, - * we copy up the lower dir first and then encode an upper dir file handle. + * possible when there are redirects in lower layers and non-indexed merge dirs. + * To mitigate those case, we may copy up the lower dir ancestor before encode + * a lower dir file handle. + * + * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ -static bool ovl_should_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct dentry *dentry) { struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) - return false; + return 0; /* - * Decoding a merge dir, whose origin's parent is under a redirected - * lower dir is not always possible. As a simple aproximation, we do - * not encode lower dir file handles when overlay has multiple lower - * layers and origin is below the topmost lower layer. + * Upper file handle for non-indexed upper. * - * TODO: copy up only the parent that is under redirected lower. + * Root is never indexed, so if there's an upper layer, encode upper for + * root. */ - if (d_is_dir(dentry) && ofs->upper_mnt && - OVL_E(dentry)->lowerstack[0].layer->idx > 1) - return false; - - /* Decoding a non-indexed upper from origin is not implemented */ if (ovl_dentry_upper(dentry) && !ovl_test_flag(OVL_INDEX, d_inode(dentry))) - return false; - - return true; -} - -static int ovl_encode_maybe_copy_up(struct dentry *dentry) -{ - int err; - - if (ovl_dentry_upper(dentry)) return 0; - err = ovl_want_write(dentry); - if (err) - return err; - - err = ovl_copy_up(dentry); + /* + * Decoding a merge dir, whose origin's ancestor is under a redirected + * lower dir or under a non-indexed upper is not always possible. + * ovl_connect_layer() will try to make origin's layer "connected" by + * copying up a "connectable" ancestor. + */ + if (d_is_dir(dentry) && ofs->upper_mnt) + return ovl_connect_layer(dentry); - ovl_drop_write(dentry); - return err; + /* Lower file handle for indexed and non-upper dir/non-dir */ + return 1; } static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) { - struct dentry *origin = ovl_dentry_lower(dentry); struct ovl_fh *fh = NULL; - int err; + int err, enc_lower; /* - * If we should not encode a lower dir file handle, copy up and encode - * an upper dir file handle. + * Check if we should encode a lower or upper file handle and maybe + * copy up an ancestor to make lower file handle connectable. */ - if (!ovl_should_encode_origin(dentry)) { - err = ovl_encode_maybe_copy_up(dentry); - if (err) - goto fail; - - origin = NULL; - } + err = enc_lower = ovl_check_encode_origin(dentry); + if (enc_lower < 0) + goto fail; - /* Encode an upper or origin file handle */ - fh = ovl_encode_fh(origin ?: ovl_dentry_upper(dentry), !origin); + /* Encode an upper or lower file handle */ + fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -355,8 +477,8 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, dput(upper); } - if (!this) - return NULL; + if (IS_ERR_OR_NULL(this)) + return this; if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) { dput(this); @@ -498,7 +620,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb, if (err == -ECHILD) { this = ovl_lookup_real_ancestor(sb, real, layer); - err = IS_ERR(this) ? PTR_ERR(this) : 0; + err = PTR_ERR_OR_ZERO(this); } if (!err) { dput(connected); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index fcd97b783fa1..3b1bd469accd 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -669,38 +669,59 @@ struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, return inode; } +/* + * Does overlay inode need to be hashed by lower inode? + */ +static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, + struct dentry *lower, struct dentry *index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + /* No, if pure upper */ + if (!lower) + return false; + + /* Yes, if already indexed */ + if (index) + return true; + + /* Yes, if won't be copied up */ + if (!ofs->upper_mnt) + return true; + + /* No, if lower hardlink is or will be broken on copy up */ + if ((upper || !ovl_indexdir(sb)) && + !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return false; + + /* No, if non-indexed upper with NFS export */ + if (sb->s_export_op && upper) + return false; + + /* Otherwise, hash by lower inode for fsnotify */ + return true; +} + struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, struct dentry *lowerdentry, struct dentry *index, unsigned int numlower) { - struct ovl_fs *ofs = sb->s_fs_info; struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; - /* Already indexed or could be indexed on copy up? */ - bool indexed = (index || (ovl_indexdir(sb) && !upperdentry)); - struct dentry *origin = indexed ? lowerdentry : NULL; + bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); bool is_dir; - if (WARN_ON(upperdentry && indexed && !lowerdentry)) - return ERR_PTR(-EIO); - if (!realinode) realinode = d_inode(lowerdentry); /* - * Copy up origin (lower) may exist for non-indexed non-dir upper, but - * we must not use lower as hash key in that case. - * Hash non-dir that is or could be indexed by origin inode. - * Hash dir that is or could be merged by origin inode. - * Hash pure upper and non-indexed non-dir by upper inode. - * Hash non-indexed dir by upper inode for NFS export. + * Copy up origin (lower) may exist for non-indexed upper, but we must + * not use lower as hash key if this is a broken hardlink. */ is_dir = S_ISDIR(realinode->i_mode); - if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt)) - origin = lowerdentry; - - if (upperdentry || origin) { - struct inode *key = d_inode(origin ?: upperdentry); + if (upperdentry || bylower) { + struct inode *key = d_inode(bylower ? lowerdentry : + upperdentry); unsigned int nlink = is_dir ? 1 : realinode->i_nlink; inode = iget5_locked(sb, (unsigned long) key, @@ -728,6 +749,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); } else { + /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) goto out_nomem; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index de3e6da1d5a5..70fcfcc684cc 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -913,9 +913,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, stack[ctr].layer = lower.layer; ctr++; - if (d.stop) - break; - /* * Following redirects can have security consequences: it's like * a symlink into the lower layer without the permission checks. @@ -933,6 +930,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; } + if (d.stop) + break; + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0df25a9c94bd..225ff1171147 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -40,6 +40,7 @@ enum ovl_inode_flag { enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, + OVL_E_CONNECTED, }; /* diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 9ee37c76091d..7c24619ae7fc 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1359,6 +1359,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); + ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, ovl_dentry_lower(root_dentry)); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e8a93bc8285d..d1e82761de81 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -510,6 +510,10 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, buf, tsz)) return -EFAULT; + } else if (m->type == KCORE_USER) { + /* User page is handled prior to normal kernel page: */ + if (copy_to_user(buffer, (char *)start, tsz)) + return -EFAULT; } else { if (kern_addr_valid(start)) { /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 9990957264e3..76bf9cc62074 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -118,13 +118,22 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (kinfo->si_signo == SIGBUS && + kinfo->si_code == BUS_MCEERR_AO) + err |= __put_user((short) kinfo->si_addr_lsb, + &uinfo->ssi_addr_lsb); +#endif +#ifdef BUS_MCEERR_AR + /* * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ if (kinfo->si_signo == SIGBUS && - (kinfo->si_code == BUS_MCEERR_AR || - kinfo->si_code == BUS_MCEERR_AO)) + kinfo->si_code == BUS_MCEERR_AR) err |= __put_user((short) kinfo->si_addr_lsb, &uinfo->ssi_addr_lsb); #endif diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 8664db25a9a6..215c225b2ca1 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -106,6 +106,7 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, { return sysfs_do_create_link(kobj, target, name, 0); } +EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index fd975524f460..05c66e05ae20 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -767,7 +767,7 @@ int xfs_scrub_agfl( struct xfs_scrub_context *sc) { - struct xfs_scrub_agfl_info sai = { 0 }; + struct xfs_scrub_agfl_info sai; struct xfs_agf *agf; xfs_agnumber_t agno; unsigned int agflcount; @@ -795,6 +795,7 @@ xfs_scrub_agfl( xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } + memset(&sai, 0, sizeof(sai)); sai.sz_entries = agflcount; sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); if (!sai.entries) { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 66e1edbfb2b2..046469fcc1b8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -955,15 +955,29 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } +static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps) +{ + return nimaps && + imap->br_startblock != HOLESTARTBLOCK && + imap->br_state != XFS_EXT_UNWRITTEN; +} + static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) { /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. + * COW writes may allocate delalloc space or convert unwritten COW + * extents, so we need to make sure to take the lock exclusively here. */ if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) return true; - if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + + /* + * Extents not yet cached requires exclusive access, don't block. + * This is an opencoded xfs_ilock_data_map_shared() to cater for the + * non-blocking behaviour. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + !(ip->i_df.if_flags & XFS_IFEXTENTS)) return true; return false; } @@ -993,16 +1007,18 @@ xfs_file_iomap_begin( return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } - if (need_excl_ilock(ip, flags)) { + if (need_excl_ilock(ip, flags)) lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, XFS_ILOCK_EXCL); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + else + lockmode = XFS_ILOCK_SHARED; - if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { - error = -EAGAIN; - goto out_unlock; + if (flags & IOMAP_NOWAIT) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return -EAGAIN; + if (!xfs_ilock_nowait(ip, lockmode)) + return -EAGAIN; + } else { + xfs_ilock(ip, lockmode); } ASSERT(offset <= mp->m_super->s_maxbytes); @@ -1024,7 +1040,9 @@ xfs_file_iomap_begin( goto out_unlock; } - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (xfs_is_reflink_inode(ip) && + ((flags & IOMAP_WRITE) || + ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) { if (flags & IOMAP_DIRECT) { /* * A reflinked inode will result in CoW alloc. diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 3a55d6fc271b..7a39f40645f7 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -23,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" @@ -456,10 +457,12 @@ xfs_cui_recover( * transaction. Normally, any work that needs to be deferred * gets attached to the same defer_ops that scheduled the * refcount update. However, we're in log recovery here, so we - * we create our own defer_ops and use that to finish up any - * work that doesn't fit. + * we use the passed in defer_ops and to finish up any work that + * doesn't fit. We need to reserve enough blocks to handle a + * full btree split on either end of the refcount range. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; cudp = xfs_trans_get_cud(tp, cuip); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index f3b139c9aa16..49d3124863a8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -23,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" @@ -470,7 +471,8 @@ xfs_rui_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; rudp = xfs_trans_get_rud(tp, ruip); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7aba628dc527..93588ea3d3d2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -250,6 +250,7 @@ xfs_parseargs( return -EINVAL; break; case Opt_logdev: + kfree(mp->m_logname); mp->m_logname = match_strdup(args); if (!mp->m_logname) return -ENOMEM; @@ -258,6 +259,7 @@ xfs_parseargs( xfs_warn(mp, "%s option not allowed on this system", p); return -EINVAL; case Opt_rtdev: + kfree(mp->m_rtname); mp->m_rtname = match_strdup(args); if (!mp->m_rtname) return -ENOMEM; |