diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-14 21:35:31 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-12-14 21:35:31 -0800 |
commit | 5cc60aeedf315a7513f92e98314e86d515b986d1 (patch) | |
tree | c83e252b8e2ad2cd42cbbb7974348290831dbb2a | |
parent | 5c2992ee7fd8a29d04125dc0aa3522784c5fa5eb (diff) | |
parent | 9807b773dad4555665931d57065d7f8adab58e0f (diff) |
Merge tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
Pull xfs updates from Dave Chinner:
"There is quite a varied bunch of stuff in this update, and some of it
you will have already merged through the ext4 tree which imported the
dax-4.10-iomap-pmd topic branch from the XFS tree.
There is also a new direct IO implementation that uses the iomap
infrastructure. It's much simpler, faster, and has lower IO latency
than the existing direct IO infrastructure.
Summary:
- DAX PMD faults via iomap infrastructure
- Direct-io support in iomap infrastructure
- removal of now-redundant XFS inode iolock, replaced with VFS
i_rwsem
- synchronisation with fixes and changes in userspace libxfs code
- extent tree lookup helpers
- lots of little corruption detection improvements to verifiers
- optimised CRC calculations
- faster buffer cache lookups
- deprecation of barrier/nobarrier mount options - we always use
REQ_FUA/REQ_FLUSH where appropriate for data integrity now
- cleanups to speculative preallocation
- miscellaneous minor bug fixes and cleanups"
* tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (63 commits)
xfs: nuke unused tracepoint definitions
xfs: use GFP_NOFS when allocating btree cursors
xfs: use xfs_vn_setattr_size to check on new size
xfs: deprecate barrier/nobarrier mount option
xfs: Always flush caches when integrity is required
xfs: ignore leaf attr ichdr.count in verifier during log replay
xfs: use rhashtable to track buffer cache
xfs: optimise CRC updates
xfs: make xfs btree stats less huge
xfs: don't cap maximum dedupe request length
xfs: don't allow di_size with high bit set
xfs: error out if trying to add attrs and anextents > 0
xfs: don't crash if reading a directory results in an unexpected hole
xfs: complain if we don't get nextents bmap records
xfs: check for bogus values in btree block headers
xfs: forbid AG btrees with level == 0
xfs: several xattr functions can be void
xfs: handle cow fork in xfs_bmap_trace_exlist
xfs: pass state not whichfork to trace_xfs_extlist
xfs: Move AGI buffer type setting to xfs_read_agi
...
67 files changed, 1357 insertions, 1464 deletions
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index c2d44e6e117b..3b9b5c149f32 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -51,13 +51,6 @@ default behaviour. CRC enabled filesystems always use the attr2 format, and so will reject the noattr2 mount option if it is set. - barrier (*) - nobarrier - Enables/disables the use of block layer write barriers for - writes into the journal and for data integrity operations. - This allows for drive level write caching to be enabled, for - devices that support write barriers. - discard nodiscard (*) Enable/disable the issuing of commands to let the block @@ -228,7 +221,10 @@ default behaviour. Deprecated Mount Options ======================== -None at present. + Name Removal Schedule + ---- ---------------- + barrier no earlier than v4.15 + nobarrier no earlier than v4.15 Removed Mount Options diff --git a/fs/direct-io.c b/fs/direct-io.c index 86aa79859d4d..aeae8c063451 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. 
*/ -static int sb_init_dio_done_wq(struct super_block *sb) +int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a3..4fcf51766d4a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor); + +/* direct-io.c: */ +int sb_init_dio_done_wq(struct super_block *sb); diff --git a/fs/iomap.c b/fs/iomap.c index 13dd413b2b9c..354a123f170e 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -24,6 +24,7 @@ #include <linux/uio.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include "internal.h" @@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); + +/* + * Private flags for iomap_dio, must not overlap with the public ones in + * iomap.h: + */ +#define IOMAP_DIO_WRITE (1 << 30) +#define IOMAP_DIO_DIRTY (1 << 31) + +struct iomap_dio { + struct kiocb *iocb; + iomap_dio_end_io_t *end_io; + loff_t i_size; + loff_t size; + atomic_t ref; + unsigned flags; + int error; + + union { + /* used during submission and for synchronous completion: */ + struct { + struct iov_iter *iter; + struct task_struct *waiter; + struct request_queue *last_queue; + blk_qc_t cookie; + } submit; + + /* used for aio completion: */ + struct { + struct work_struct work; + } aio; + }; +}; + +static ssize_t iomap_dio_complete(struct iomap_dio *dio) +{ + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (dio->end_io) { + ret = dio->end_io(iocb, + dio->error ? 
dio->error : dio->size, + dio->flags); + } else { + ret = dio->error; + } + + if (likely(!ret)) { + ret = dio->size; + /* check for short read */ + if (iocb->ki_pos + ret > dio->i_size && + !(dio->flags & IOMAP_DIO_WRITE)) + ret = dio->i_size - iocb->ki_pos; + iocb->ki_pos += ret; + } + + inode_dio_end(file_inode(iocb->ki_filp)); + kfree(dio); + + return ret; +} + +static void iomap_dio_complete_work(struct work_struct *work) +{ + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); + struct kiocb *iocb = dio->iocb; + bool is_write = (dio->flags & IOMAP_DIO_WRITE); + ssize_t ret; + + ret = iomap_dio_complete(dio); + if (is_write && ret > 0) + ret = generic_write_sync(iocb, ret); + iocb->ki_complete(iocb, ret, 0); +} + +/* + * Set an error in the dio if none is set yet. We have to use cmpxchg + * as the submission context and the completion context(s) can race to + * update the error. + */ +static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) +{ + cmpxchg(&dio->error, 0, ret); +} + +static void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_error) + iomap_dio_set_error(dio, bio->bi_error); + + if (atomic_dec_and_test(&dio->ref)) { + if (is_sync_kiocb(dio->iocb)) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + wake_up_process(waiter); + } else if (dio->flags & IOMAP_DIO_WRITE) { + struct inode *inode = file_inode(dio->iocb->ki_filp); + + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + } else { + iomap_dio_complete_work(&dio->aio.work); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + +static blk_qc_t +iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t 
pos, + unsigned len) +{ + struct page *page = ZERO_PAGE(0); + struct bio *bio; + + bio = bio_alloc(GFP_KERNEL, 1); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + get_page(page); + if (bio_add_page(bio, page, len, 0) != len) + BUG(); + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + + atomic_inc(&dio->ref); + return submit_bio(bio); +} + +static loff_t +iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct iomap_dio *dio = data; + unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); + unsigned fs_block_size = (1 << inode->i_blkbits), pad; + unsigned align = iov_iter_alignment(dio->submit.iter); + struct iov_iter iter; + struct bio *bio; + bool need_zeroout = false; + int nr_pages, ret; + + if ((pos | length | align) & ((1 << blkbits) - 1)) + return -EINVAL; + + switch (iomap->type) { + case IOMAP_HOLE: + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) + return -EIO; + /*FALLTHRU*/ + case IOMAP_UNWRITTEN: + if (!(dio->flags & IOMAP_DIO_WRITE)) { + iov_iter_zero(length, dio->submit.iter); + dio->size += length; + return length; + } + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + case IOMAP_MAPPED: + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + if (iomap->flags & IOMAP_F_NEW) + need_zeroout = true; + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + /* + * Operate on a partial iter trimmed to the extent we were called for. + * We'll update the iter in the dio once we're done with this extent. 
+ */ + iter = *dio->submit.iter; + iov_iter_truncate(&iter, length); + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + if (nr_pages <= 0) + return nr_pages; + + if (need_zeroout) { + /* zero out from the start of the block to the write offset */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos - pad, pad); + } + + do { + if (dio->error) + return 0; + + bio = bio_alloc(GFP_KERNEL, nr_pages); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, &iter); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + if (dio->flags & IOMAP_DIO_WRITE) { + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + task_io_account_write(bio->bi_iter.bi_size); + } else { + bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + + atomic_inc(&dio->ref); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); + } while (nr_pages); + + if (need_zeroout) { + /* zero out from the end of the write to the end of the block */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + } + + iov_iter_advance(dio->submit.iter, length); + return length; +} + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops, + iomap_dio_end_io_t end_io) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = file_inode(iocb->ki_filp); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0; + unsigned int flags = IOMAP_DIRECT; + struct blk_plug plug; + struct iomap_dio *dio; + + lockdep_assert_held(&inode->i_rwsem); + + if (!count) + 
return 0; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + if (!dio) + return -ENOMEM; + + dio->iocb = iocb; + atomic_set(&dio->ref, 1); + dio->size = 0; + dio->i_size = i_size_read(inode); + dio->end_io = end_io; + dio->error = 0; + dio->flags = 0; + + dio->submit.iter = iter; + if (is_sync_kiocb(iocb)) { + dio->submit.waiter = current; + dio->submit.cookie = BLK_QC_T_NONE; + dio->submit.last_queue = NULL; + } + + if (iov_iter_rw(iter) == READ) { + if (pos >= dio->i_size) + goto out_free_dio; + + if (iter->type == ITER_IOVEC) + dio->flags |= IOMAP_DIO_DIRTY; + } else { + dio->flags |= IOMAP_DIO_WRITE; + flags |= IOMAP_WRITE; + } + + if (mapping->nrpages) { + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_free_dio; + + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; + } + + inode_dio_begin(inode); + + blk_start_plug(&plug); + do { + ret = iomap_apply(inode, pos, count, flags, ops, dio, + iomap_dio_actor); + if (ret <= 0) { + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) + ret = 0; + break; + } + pos += ret; + } while ((count = iov_iter_count(iter)) > 0); + blk_finish_plug(&plug); + + if (ret < 0) + iomap_dio_set_error(dio, ret); + + if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + iomap_dio_set_error(dio, ret); + } + + if (!atomic_dec_and_test(&dio->ref)) { + if (!is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !dio->submit.last_queue || + !blk_mq_poll(dio->submit.last_queue, + dio->submit.cookie)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + } + + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or 
faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + return iomap_dio_complete(dio); + +out_free_dio: + kfree(dio); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index effb64cf714f..5050056a0b06 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2455,12 +2455,15 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return false; /* @@ -2477,7 +2480,8 @@ xfs_agf_verify( return false; if (xfs_sb_version_hasreflink(&mp->m_sb) && - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return false; return true;; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c..efb467b10a71 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -421,13 +421,17 @@ xfs_allocbt_init_cursor( ASSERT(btnum == XFS_BTNUM_BNO || btnum 
== XFS_BTNUM_CNT); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_BNO) + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + else + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8ea91f363093..2852521fc8ec 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -253,6 +253,7 @@ xfs_attr3_leaf_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_perag *pag = bp->b_pag; struct xfs_attr3_icleaf_hdr ichdr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -273,7 +274,12 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) return false; } - if (ichdr.count == 0) + /* + * In recovery there is a transient state where count == 0 is valid + * because we may have transitioned an empty shortform attr to a leaf + * if the attr didn't fit in shortform. 
+ */ + if (pag && pag->pagf_init && ichdr.count == 0) return false; /* XXX: need to range check rest of attr header values */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f827..f7dda0c237b0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); -int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); +int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* @@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr3_leaf_list_int(struct xfs_buf *bp, +void xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c6eb21940783..2760bc3b2536 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,6 +49,8 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" +#include "xfs_rmap_btree.h" +#include "xfs_icache.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -190,8 +192,12 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ + xfs_filblks_t orig_len; mp = ip->i_mount; + + /* Calculate the worst-case size of the bmbt. 
*/ + orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -199,12 +205,20 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) { + rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; + break; + } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } + + /* Calculate the worst-case size of the rmapbt. */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + + mp->m_rmap_maxlevels; + return rval; } @@ -504,7 +518,7 @@ void xfs_bmap_trace_exlist( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ + int whichfork, /* data or attr or cow fork */ unsigned long caller_ip) { xfs_extnum_t idx; /* extent record index */ @@ -513,11 +527,13 @@ xfs_bmap_trace_exlist( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); + trace_xfs_extlist(ip, idx, state, caller_ip); } /* @@ -811,7 +827,7 @@ try_another_ag: XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { @@ -1137,6 +1153,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; + if (ip->i_d.di_anextents != 0) { + error = -EFSCORRUPTED; + goto trans_cancel; + } if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. 
@@ -1144,7 +1164,6 @@ xfs_bmap_add_attrfork( ASSERT(ip->i_d.di_aformat == 0); ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } - ASSERT(ip->i_d.di_anextents == 0); xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1296,7 +1315,7 @@ xfs_bmap_read_extents( /* * Here with bp and block set to the leftmost leaf node in the tree. */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + room = xfs_iext_count(ifp); i = 0; /* * Loop over all leaf nodes. Copy information to the extent records. @@ -1361,8 +1380,9 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) + return -EFSCORRUPTED; + ASSERT(i == xfs_iext_count(ifp)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: @@ -1370,97 +1390,6 @@ error0: return -EFSCORRUPTED; } - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. 
- */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(ip->i_mount, xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long 
long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - /* * Returns the file-relative block number of the first unused block(s) * in the file with at least "len" logically contiguous blocks free. @@ -1497,7 +1426,7 @@ xfs_bmap_first_unused( (error = xfs_iread_extents(tp, ip, whichfork))) return error; lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); off = xfs_bmbt_get_startoff(ep); @@ -1523,44 +1452,44 @@ xfs_bmap_first_unused( */ int /* error */ xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ { - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int error; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return -EIO; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; + case 
XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return -EIO; } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; } - /* - * Otherwise *last_block is already the right answer. - */ + + if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) { + if (got.br_startoff <= *last_block - 1) + return 0; + } + + if (xfs_iext_get_extent(ifp, idx - 1, &got)) { + *last_block = got.br_startoff + got.br_blockcount; + return 0; + } + + *last_block = 0; return 0; } @@ -1582,7 +1511,7 @@ xfs_bmap_last_extent( return error; } - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *is_empty = 1; return 0; @@ -1735,7 +1664,7 @@ xfs_bmap_add_extent_delay_real( &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -1794,7 +1723,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. 
*/ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2300,7 +2229,7 @@ xfs_bmap_add_extent_unwritten_real( ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2356,7 +2285,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (*idx < xfs_iext_count(&ip->i_df) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2836,7 +2765,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2966,7 +2895,7 @@ xfs_bmap_add_extent_hole_real( ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -2992,7 +2921,7 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. 
*/ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) @@ -4145,12 +4074,11 @@ xfs_bmapi_read( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_fileoff_t obno; xfs_fileoff_t end; - xfs_extnum_t lastx; + xfs_extnum_t idx; int error; - int eof; + bool eof = false; int n = 0; int whichfork = xfs_bmapi_whichfork(flags); @@ -4190,7 +4118,8 @@ xfs_bmapi_read( return error; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) + eof = true; end = bno + len; obno = bno; @@ -4221,10 +4150,8 @@ xfs_bmapi_read( break; /* Else go on to the next record. */ - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++idx, &got)) + eof = true; } *nmap = n; return 0; @@ -4234,10 +4161,10 @@ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, + xfs_fileoff_t off, xfs_filblks_t len, + xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof) { @@ -4248,10 +4175,17 @@ xfs_bmapi_reserve_delalloc( char rt = XFS_IS_REALTIME_INODE(ip); xfs_extlen_t extsz; int error; + xfs_fileoff_t aoff = off; - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. 
+ */ + alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; /* Figure out the extent size, adjust alen */ if (whichfork == XFS_COW_FORK) @@ -4259,7 +4193,12 @@ xfs_bmapi_reserve_delalloc( else extsz = xfs_get_extsz_hint(ip); if (extsz) { - error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + struct xfs_bmbt_irec prev; + + if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); } @@ -4312,6 +4251,16 @@ xfs_bmapi_reserve_delalloc( */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. + */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + ASSERT(got->br_startoff <= aoff); ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); ASSERT(isnullstartblock(got->br_startblock)); @@ -4349,7 +4298,7 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (bma->idx != NULLEXTNUM && bma->idx) { + if (bma->idx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &bma->prev); } @@ -4563,7 +4512,7 @@ xfs_bmapi_write( struct xfs_ifork *ifp; struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* after the end of extents */ + bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ @@ -4641,12 +4590,14 @@ 
xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, - &bma.prev); n = 0; end = bno + len; obno = bno; + if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got)) + eof = true; + if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; bma.tp = tp; bma.ip = ip; bma.total = total; @@ -4733,11 +4684,8 @@ xfs_bmapi_write( /* Else go on to the next record. */ bma.prev = bma.got; - if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), - &bma.got); - } else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got)) + eof = true; } *nmap = n; @@ -4885,7 +4833,7 @@ xfs_bmap_del_extent_delay( da_new = 0; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -4902,8 +4850,11 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -((long)del->br_blockcount), 0, isrt ? 
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; ip->i_delayed_blks -= del->br_blockcount; if (whichfork == XFS_COW_FORK) @@ -5013,7 +4964,7 @@ xfs_bmap_del_extent_cow( got_endoff = got->br_startoff + got->br_blockcount; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -5119,8 +5070,7 @@ xfs_bmap_del_extent( state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp))); ASSERT(del->br_blockcount > 0); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &got); @@ -5434,8 +5384,6 @@ __xfs_bunmapi( { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ - int eof; /* is deleting at eof */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ xfs_bmbt_irec_t got; /* current extent record */ @@ -5445,8 +5393,6 @@ __xfs_bunmapi( int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t prev; /* previous extent record */ xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ @@ -5477,8 +5423,7 @@ __xfs_bunmapi( if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { + if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; } @@ -5486,18 +5431,17 @@ __xfs_bunmapi( isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 
1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... */ - if (eof) { - ep = xfs_iext_get_ext(ifp, --lastx); - xfs_bmbt_get_all(ep, &got); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) { + ASSERT(lastx > 0); + xfs_iext_get_extent(ifp, --lastx, &got); bno = got.br_startoff + got.br_blockcount - 1; } + logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); @@ -5528,8 +5472,7 @@ __xfs_bunmapi( if (got.br_startoff > bno) { if (--lastx < 0) break; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); } /* * Is the last block of this extent before the range @@ -5543,7 +5486,6 @@ __xfs_bunmapi( * Then deal with the (possibly delayed) allocated space * we found. */ - ASSERT(ep != NULL); del = got; wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { @@ -5624,15 +5566,12 @@ __xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (got.br_startoff > bno) { - if (--lastx >= 0) { - ep = xfs_iext_get_ext(ifp, - lastx); - xfs_bmbt_get_all(ep, &got); - } - } + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec prev; + /* * This one is already unwritten. * It must have a written left neighbor. @@ -5640,8 +5579,7 @@ __xfs_bunmapi( * try again. 
*/ ASSERT(lastx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, - lastx - 1), &prev); + xfs_iext_get_extent(ifp, lastx - 1, &prev); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == @@ -5739,13 +5677,9 @@ nodelete: */ if (bno != (xfs_fileoff_t)-1 && bno >= start) { if (lastx >= 0) { - ep = xfs_iext_get_ext(ifp, lastx); - if (xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, - lastx); - } - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); } extno++; } @@ -5963,7 +5897,7 @@ xfs_bmse_shift_one( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); xfs_bmbt_get_all(gotp, &got); @@ -6140,7 +6074,7 @@ xfs_bmap_shift_extents( * are collapsing out, so we cannot use the count of real extents here. * Instead we have to calculate it from the incore fork. */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); if (total_extents == 0) { *done = 1; goto del_cursor; @@ -6200,7 +6134,7 @@ xfs_bmap_shift_extents( * count can change. Update the total and grade the next record. 
*/ if (direction == SHIFT_LEFT) { - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); stop_extent = total_extents; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 7cae6ec27fa6..cecd094404cc 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -237,14 +237,9 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); -struct xfs_bmbt_rec_host * - xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, - int fork, int *eofp, xfs_extnum_t *lastxp, - struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, xfs_filblks_t len, - struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, - xfs_extnum_t *lastx, int eof); + xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef..d6330c297ca0 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -796,13 +796,14 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_btnum = XFS_BTNUM_BMAP; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e80993c8a59..21e6a6ab6b9a 100644 --- 
a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block( if (error) return error; + /* Check the inode owner since the verifiers don't. */ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) + goto out_bad; + + /* Did we get the level we were looking for? */ + if (be16_to_cpu((*blkp)->bb_level) != level) + goto out_bad; + + /* Check that internal nodes have at least one record. */ + if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) + goto out_bad; + xfs_btree_setbuf(cur, level, bp); return 0; + +out_bad: + *blkp = NULL; + xfs_trans_brelse(cur->bc_tp, bp); + return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..b69b947c4c1b 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -96,46 +96,10 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(mp, type, stat) \ - XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) #define XFS_BTREE_STATS_INC(cur, stat) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ - case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ - case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) - -#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ - XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) -#define XFS_BTREE_STATS_ADD(cur, stat, val) \ -do { \ - struct 
xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: \ - __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ - case XFS_BTNUM_CNT: \ - __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: \ - __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ - case XFS_BTNUM_INO: \ - __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ - case XFS_BTNUM_FINO: \ - __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ - case XFS_BTNUM_RMAP: \ - __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ - case XFS_BTNUM_REFC: \ - __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) + XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ + XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ @@ -253,6 +217,7 @@ typedef struct xfs_btree_cur __uint8_t bc_nlevels; /* number of levels in the tree */ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ + int bc_statoff; /* offset of btre stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd..a416c7cb23ea 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -6,10 +6,11 @@ /* * Calculate the intermediate checksum for a buffer that has the CRC field * inside it. The offset of the 32bit crc fields is passed as the - * cksum_offset parameter. + * cksum_offset parameter. We do not modify the buffer during verification, + * hence we have to split the CRC calculation across the cksum_offset. 
*/ static inline __uint32_t -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { __uint32_t zero = 0; __uint32_t crc; @@ -26,6 +27,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) } /* + * Fast CRC method where the buffer is modified. Callers must have exclusive + * access to the buffer while the calculation takes place. + */ +static inline __uint32_t +xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) +{ + /* zero the CRC field */ + *(__le32 *)(buffer + cksum_offset) = 0; + + /* single pass CRC calculation for the entire buffer */ + return crc32c(XFS_CRC_SEED, buffer, length); +} + +/* * Convert the intermediate checksum to the final ondisk format. * * The CRC32c calculation uses LE format even on BE machines, but returns the @@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc) /* * Helper to generate the checksum for a buffer. + * + * This modifies the buffer temporarily - callers must have exclusive + * access to the buffer while the calculation takes place. 
*/ static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 20a96dd5af7e..c58d72c220f5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -93,7 +93,7 @@ xfs_ascii_ci_compname( return result; } -static struct xfs_nameops xfs_ascii_ci_nameops = { +static const struct xfs_nameops xfs_ascii_ci_nameops = { .hashname = xfs_ascii_ci_hashname, .compname = xfs_ascii_ci_compname, }; diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index becc926c3e3d..0197590fa7d7 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -157,6 +157,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); +extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, + struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, @@ -177,6 +180,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t 
ino); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fde..d478065b9544 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -329,7 +329,7 @@ xfs_dir3_data_read( err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } @@ -505,8 +505,9 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. */ void -xfs_dir2_data_freescan( - struct xfs_inode *dp, +xfs_dir2_data_freescan_int( + struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, struct xfs_dir2_data_hdr *hdr, int *loghead) { @@ -516,7 +517,6 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ - struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -526,13 +526,13 @@ xfs_dir2_data_freescan( /* * Start by clearing the table. */ - bf = dp->d_ops->data_bestfree_p(hdr); + bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. 
*/ - p = (char *)dp->d_ops->data_entry_p(hdr); + p = (char *)ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(geo, hdr); @@ -559,12 +559,22 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); - p += dp->d_ops->data_entsize(dep->namelen); + be16_to_cpu(*ops->data_entry_tag_p(dep))); + p += ops->data_entsize(dep->namelen); } } } +void +xfs_dir2_data_freescan( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) +{ + return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, + hdr, loghead); +} + /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a4..d04547fcf274 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -21,7 +21,6 @@ struct dir_context; /* xfs_dir2.c */ -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc..f272abff11e1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2344,7 +2344,8 @@ xfs_imap( imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << + mp->m_sb.sb_inodelog); return 0; } @@ -2372,7 +2373,7 @@ out_map: imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = 
(unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds @@ -2450,8 +2451,6 @@ xfs_ialloc_log_agi( ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through @@ -2512,8 +2511,15 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; - if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + if (be32_to_cpu(agi->agi_level) < 1 || + be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + (be32_to_cpu(agi->agi_free_level) < 1 || + be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't @@ -2592,6 +2598,8 @@ xfs_read_agi( XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; + if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e011..0fd086d03d41 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -357,7 +357,7 @@ xfs_inobt_init_cursor( struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; @@ -365,9 +365,11 @@ xfs_inobt_init_cursor( if (btnum == XFS_BTNUM_INO) { cur->bc_nlevels = be32_to_cpu(agi->agi_level); cur->bc_ops = &xfs_inobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); } else { cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ops = &xfs_finobt_ops; + cur->bc_statoff = 
XFS_STATS_CALC_INDEX(xs_fibt_2); } cur->bc_blocklog = mp->m_sb.sb_blocklog; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434..dd483e2767f7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -383,7 +383,7 @@ xfs_log_dinode_to_disk( static bool xfs_dinode_verify( struct xfs_mount *mp, - struct xfs_inode *ip, + xfs_ino_t ino, struct xfs_dinode *dip) { uint16_t flags; @@ -392,6 +392,14 @@ xfs_dinode_verify( if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + /* No zero-length symlinks. */ + if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; @@ -401,7 +409,7 @@ xfs_dinode_verify( if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) return false; - if (be64_to_cpu(dip->di_ino) != ip->i_ino) + if (be64_to_cpu(dip->di_ino) != ino) return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; @@ -436,7 +444,7 @@ xfs_dinode_calc_crc( return; ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } @@ -493,7 +501,7 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { + if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { xfs_alert(mp, "%s: validation failed for inode %lld failed", __func__, ip->i_ino); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 3cfe12a4f58a..6848a0afbce7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -58,8 +58,8 @@ struct xfs_icdinode { */ struct xfs_imap { xfs_daddr_t im_blkno; /* starting BB of inode 
chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ + unsigned short im_len; /* length in BBs of inode chunk */ + unsigned short im_boffset; /* inode offset in block in bytes */ }; int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5dd56d3dbb3a..222e103356c6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -775,6 +775,13 @@ xfs_idestroy_fork( } } +/* Count number of incore extents based on if_bytes */ +xfs_extnum_t +xfs_iext_count(struct xfs_ifork *ifp) +{ + return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); +} + /* * Convert in-core extents to on-disk form * @@ -803,7 +810,7 @@ xfs_iextents_copy( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nrecs = xfs_iext_count(ifp); XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); ASSERT(nrecs > 0); @@ -941,7 +948,7 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(idx < xfs_iext_count(ifp)); if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; @@ -1017,7 +1024,7 @@ xfs_iext_add( int new_size; /* size of extents after adding */ xfs_extnum_t nextents; /* number of extents in file */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT((idx >= 0) && (idx <= nextents)); byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); new_size = ifp->if_bytes + byte_diff; @@ -1241,7 +1248,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, idx, state, _RET_IP_); ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); if (new_size == 0) { @@ -1270,7 +1277,7 @@ 
xfs_iext_remove_inline( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(((nextents - ext_diff) > 0) && (nextents - ext_diff) < XFS_INLINE_EXTS); @@ -1309,7 +1316,7 @@ xfs_iext_remove_direct( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); new_size = ifp->if_bytes - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (new_size == 0) { xfs_iext_destroy(ifp); @@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct( int size; /* size of file extents */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); @@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext( xfs_extnum_t nextents; /* number of file extents */ xfs_fileoff_t startoff = 0; /* start offset of extent */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *idxp = 0; return NULL; @@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec( ASSERT(ifp->if_flags & XFS_IFEXTIREC); ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + ASSERT(page_idx <= xfs_iext_count(ifp)); + ASSERT(page_idx < xfs_iext_count(ifp) || realloc); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; @@ -1782,7 +1789,7 @@ xfs_iext_irec_init( xfs_extnum_t nextents; /* number of extents in file */ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); @@ -1906,7 +1913,7 @@ xfs_iext_irec_compact( ASSERT(ifp->if_flags & XFS_IFEXTIREC); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - 
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { xfs_iext_destroy(ifp); @@ -1996,3 +2003,49 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } + +/* + * Lookup the extent covering bno. + * + * If there is an extent covering bno return the extent index, and store the + * expanded extent structure in *gotp, and the extent index in *idx. + * If there is no extent covering bno, but there is an extent after it (e.g. + * it lies in a hole) return that extent in *gotp and its index in *idx + * instead. + * If bno is beyond the last extent return false, and return the index after + * the last valid index in *idxp. + */ +bool +xfs_iext_lookup_extent( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t bno, + xfs_extnum_t *idxp, + struct xfs_bmbt_irec *gotp) +{ + struct xfs_bmbt_rec_host *ep; + + XFS_STATS_INC(ip->i_mount, xs_look_exlist); + + ep = xfs_iext_bno_to_ext(ifp, bno, idxp); + if (!ep) + return false; + xfs_bmbt_get_all(ep, gotp); + return true; +} + +/* + * Return true if there is an extent at index idx, and return the expanded + * extent structure at idx in that case. Else return false. 
+ */ +bool +xfs_iext_get_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + if (idx < 0 || idx >= xfs_iext_count(ifp)) + return false; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); + return true; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index c9476f50e32d..7fb8365326d1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -152,6 +152,7 @@ void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +xfs_extnum_t xfs_iext_count(struct xfs_ifork *); void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, struct xfs_bmbt_irec *, int); void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); @@ -181,6 +182,12 @@ void xfs_iext_irec_compact_pages(struct xfs_ifork *); void xfs_iext_irec_compact_full(struct xfs_ifork *); void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); +bool xfs_iext_lookup_extent(struct xfs_inode *ip, + struct xfs_ifork *ifp, xfs_fileoff_t bno, + xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); +bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); + extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 083cdd6d6c28..7ae571f8e34a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ + unsigned short blf_flags; /* misc state */ + unsigned short blf_len; /* number of blocks in this buf */ __int64_t blf_blkno; /* starting blkno of this buf 
*/ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 8e385f91d660..d9f65e2d5cc8 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -52,7 +52,7 @@ typedef struct xlog_recover { struct list_head r_itemq; /* q for items */ } xlog_recover_t; -#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) /* * This is the number of entries in the l_buf_cancel_table used during diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 453bb2757ec2..6fb2215f8ff7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor( cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_refcountbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 83e672ff7577..de25771764ba 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_rmapbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e2e1106c9fad..ea45584a9913 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,4 +1016,3 @@ xfs_rtfree_extent( } return 0; } - diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a70aec910626..2580262e4ea0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ 
-262,6 +262,12 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + if (xfs_sb_version_hascrc(&mp->m_sb) && + sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, "v5 SB sanity check failed"); + return -EFSCORRUPTED; + } + /* * Until this is fixed only page-sized or smaller data blocks work. */ @@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); - if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + if (sbp->sb_qflags & XFS_PQUOTA_ACCT && + sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, - * copy over sb_gquotino to sb_pquotino. + * copy over sb_gquotino to sb_pquotino. The NULLFSINO test + * above is to make sure we don't do this twice and wipe them + * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8d74870468c2..717909f2f7b7 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -57,7 +57,6 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) -#define NULLEXTNUM ((xfs_extnum_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -75,11 +74,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). + * CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes + * cannot be used. */ #define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 
65536 bytes */ #define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) #define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1)) #define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ #define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 38755ca96c7a..0f56fcd3a5d5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -37,11 +37,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* flags for direct write completions */ -#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) -#define XFS_DIO_FLAG_APPEND (1 << 1) -#define XFS_DIO_FLAG_COW (1 << 2) - /* * structure owned by writepages passed to individual writepage calls */ @@ -776,7 +771,7 @@ xfs_map_cow( { struct xfs_inode *ip = XFS_I(inode); struct xfs_bmbt_irec imap; - bool is_cow = false, need_alloc = false; + bool is_cow = false; int error; /* @@ -794,7 +789,7 @@ xfs_map_cow( * Else we need to check if there is a COW mapping at this offset. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!is_cow) @@ -804,7 +799,7 @@ xfs_map_cow( * And if the COW mapping has a delayed extent here we need to * allocate real space for it now. */ - if (need_alloc) { + if (isnullstartblock(imap.br_startblock)) { error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, &imap); if (error) @@ -1175,45 +1170,6 @@ xfs_vm_releasepage( } /* - * When we map a DIO buffer, we may need to pass flags to - * xfs_end_io_direct_write to tell it what kind of write IO we are doing. - * - * Note that for DIO, an IO to the highest supported file block offset (i.e. - * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 - * bit variable. 
Hence if we see this overflow, we have to assume that the IO is - * extending the file size. We won't know for sure until IO completion is run - * and the actual max write offset is communicated to the IO completion - * routine. - */ -static void -xfs_map_direct( - struct inode *inode, - struct buffer_head *bh_result, - struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool is_cow) -{ - uintptr_t *flags = (uintptr_t *)&bh_result->b_private; - xfs_off_t size = bh_result->b_size; - - trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : - XFS_IO_OVERWRITE, imap); - - if (ISUNWRITTEN(imap)) { - *flags |= XFS_DIO_FLAG_UNWRITTEN; - set_buffer_defer_completion(bh_result); - } else if (is_cow) { - *flags |= XFS_DIO_FLAG_COW; - set_buffer_defer_completion(bh_result); - } - if (offset + size > i_size_read(inode) || offset + size < 0) { - *flags |= XFS_DIO_FLAG_APPEND; - set_buffer_defer_completion(bh_result); - } -} - -/* * If this is O_DIRECT or the mpage code calling tell them how large the mapping * is, so that we can avoid repeated get_blocks calls. * @@ -1253,51 +1209,12 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. 
This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - -STATIC int -__xfs_get_blocks( +xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, - int create, - bool direct) + int create) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1308,11 +1225,8 @@ __xfs_get_blocks( int nimaps = 1; xfs_off_t offset; ssize_t size; - int new = 0; - bool is_cow = false; - bool need_alloc = false; - BUG_ON(create && !direct); + BUG_ON(create); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1321,7 +1235,7 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && offset >= i_size_read(inode)) + if (offset >= i_size_read(inode)) return 0; /* @@ -1336,52 +1250,12 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - if (create && direct && xfs_is_reflink_inode(ip)) - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, - &need_alloc); - if (!is_cow) { - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); - /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. This - * forces us to come back to get_blocks to take care of - * the CoW. 
- */ - if (create && direct && nimaps && - imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) - xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, - &imap); - } - ASSERT(!need_alloc); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; - /* for DAX, we convert unwritten extents directly */ - if (create && - (!nimaps || - (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK) || - (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; - - trace_xfs_get_blocks_alloc(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_DELALLOC, &imap); - } else if (nimaps) { + if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, ISUNWRITTEN(&imap) ? 
XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); @@ -1391,12 +1265,6 @@ __xfs_get_blocks( goto out_unlock; } - if (IS_DAX(inode) && create) { - ASSERT(!ISUNWRITTEN(&imap)); - /* zeroing is not needed at a higher layer */ - new = 0; - } - /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); @@ -1406,45 +1274,14 @@ __xfs_get_blocks( */ if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && - (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - + !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); - if (ISUNWRITTEN(&imap)) - set_buffer_unwritten(bh_result); - /* direct IO needs special help */ - if (create) - xfs_map_direct(inode, bh_result, &imap, offset, is_cow); - } /* * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ bh_result->b_bdev = xfs_find_bdev_for_inode(inode); - - /* - * If we previously allocated a block out beyond eof and we are now - * coming back to use it then we will need to flag it as new even if it - * has a disk address. - * - * With sub-block writes into unwritten extents we also need to mark - * the buffer as new so that the unwritten parts of the buffer gets - * correctly zeroed. 
- */ - if (create && - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || - (new || ISUNWRITTEN(&imap)))) - set_buffer_new(bh_result); - - BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); - return 0; out_unlock: @@ -1452,100 +1289,6 @@ out_unlock: return error; } -int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, false); -} - -int -xfs_get_blocks_direct( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true); -} - -/* - * Complete a direct I/O write request. - * - * xfs_map_direct passes us some flags in the private data to tell us what to - * do. If no flags are set, then the write IO is an overwrite wholly within - * the existing allocated file size and so there is nothing for us to do. - * - * Note that in this case the completion can be called in interrupt context, - * whereas if we have flags set we will always be called in task context - * (i.e. from a workqueue). - */ -int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - uintptr_t flags = (uintptr_t)private; - int error = 0; - - trace_xfs_end_io_direct_write(ip, offset, size); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - if (size <= 0) - return size; - - /* - * The flags tell us whether we are doing unwritten extent conversions - * or an append transaction that updates the on-disk file size. These - * cases are the only cases where we should *potentially* be needing - * to update the VFS inode size. 
- */ - if (flags == 0) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; - } - - /* - * We need to update the in-core inode size here so that we don't end up - * with the on-disk inode size being outside the in-core inode size. We - * have no other method of updating EOF for AIO, so always do it here - * if necessary. - * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - */ - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - - if (flags & XFS_DIO_FLAG_COW) - error = xfs_reflink_end_cow(ip, offset, size); - if (flags & XFS_DIO_FLAG_UNWRITTEN) { - trace_xfs_end_io_direct_write_unwritten(ip, offset, size); - - error = xfs_iomap_write_unwritten(ip, offset, size); - } - if (flags & XFS_DIO_FLAG_APPEND) { - trace_xfs_end_io_direct_write_append(ip, offset, size); - - error = xfs_setfilesize(ip, offset, size); - } - - return error; -} - STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1566,7 +1309,6 @@ xfs_vm_bmap( struct xfs_inode *ip = XFS_I(inode); trace_xfs_vm_bmap(XFS_I(inode)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1574,12 +1316,10 @@ xfs_vm_bmap( * that on reflinks inodes, so we have to skip out here. And yes, * 0 is the magic code for a bmap error.. 
*/ - if (xfs_is_reflink_inode(ip)) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (xfs_is_reflink_inode(ip)) return 0; - } + filemap_write_and_wait(mapping); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 34dc00dfb91d..cc174ec6c2fd 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -55,12 +55,6 @@ struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; -int xfs_get_blocks(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_direct(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private); int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e3da5d448bcf..d14691aa02b4 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ -/* Return 0 on success, or -errno; other state communicated via *context */ -typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, +/* void; state communicated via *context */ +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25e76cd6c053..97c45b6eb91e 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; - int error; ASSERT(context != NULL); dp = context->dp; @@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) 
(XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen); - if (error) - return error; + context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen); /* * Either search callback finished early or * didn't fit it all in the buffer after all. @@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen); - if (error) { - kmem_free(sbuf); - return error; - } + context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen); if (context->seen_enough) break; cursor->offset++; @@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr3_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } + xfs_attr3_leaf_list_int(bp, context); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; @@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. 
*/ -int +void xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) @@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int( struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_entry *entry; - int retval; int i; struct xfs_mount *mp = context->dp->i_mount; @@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return 0; + return; } } else { entry = &entries[0]; @@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int( /* * We have found our place, start copying out the new attributes. */ - retval = 0; for (; i < ichdr.count; entry++, i++) { char *name; int namelen, valuelen; @@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } - retval = context->put_listent(context, entry->flags, + context->put_listent(context, entry->flags, name, namelen, valuelen); - if (retval) - break; if (context->seen_enough) break; cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return retval; + return; } /* @@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) if (error) return error; - error = xfs_attr3_leaf_list_int(bp, context); + xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); - return error; + return 0; } int @@ -513,7 +498,7 @@ xfs_attr_list_int( * Take care to check values and protect against them changing later, * we may be reading them directly out of a user buffer. 
*/ -STATIC int +STATIC void xfs_attr_put_listent( xfs_attr_list_context_t *context, int flags, @@ -536,10 +521,10 @@ xfs_attr_put_listent( */ if (((context->flags & ATTR_SECURE) == 0) != ((flags & XFS_ATTR_SECURE) == 0)) - return 0; + return; if (((context->flags & ATTR_ROOT) == 0) != ((flags & XFS_ATTR_ROOT) == 0)) - return 0; + return; arraytop = sizeof(*alist) + context->count * sizeof(alist->al_offset[0]); @@ -548,7 +533,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 0; + return; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; @@ -558,7 +543,7 @@ xfs_attr_put_listent( alist->al_offset[context->count++] = context->firstu; alist->al_count = context->count; trace_xfs_attr_list_add(context); - return 0; + return; } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 552465e011ec..b9abce524c33 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -359,9 +359,7 @@ xfs_bmap_count_blocks( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); + xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count); return 0; } @@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole( ifp = XFS_IFORK_PTR(ip, whichfork); if (!moretocome && xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + (lastx == xfs_iext_count(ifp) - 1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -1792,6 +1790,7 @@ xfs_swap_extent_forks( struct xfs_ifork tempifp, *ifp, *tifp; int aforkblks = 0; int taforkblks = 0; + xfs_extnum_t nextents; __uint64_t tmp; int error; @@ -1877,14 +1876,13 @@ xfs_swap_extent_forks( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. 
+ /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&ip->i_df); + if (nextents <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1896,14 +1894,13 @@ xfs_swap_extent_forks( switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&tip->i_df); + if (nextents <= XFS_INLINE_EXTS) + tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1938,8 +1935,8 @@ xfs_swap_extents( * page cache safely. Once we have done this we can take the ilocks and * do the rest of the checks. 
*/ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); + lock_flags = XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ @@ -2079,15 +2076,13 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); + unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - -out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - return error; + goto out_unlock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 33c435f3316c..7f0a01f7b592 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -219,7 +219,6 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); - RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -473,6 +472,62 @@ _xfs_buf_map_pages( /* * Finding and Reading Buffers */ +static int +_xfs_buf_obj_cmp( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct xfs_buf_map *map = arg->key; + const struct xfs_buf *bp = obj; + + /* + * The key hashing in the lookup path depends on the key being the + * first element of the compare_arg, make sure to assert this. + */ + BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); + + if (bp->b_bn != map->bm_bn) + return 1; + + if (unlikely(bp->b_length != map->bm_len)) { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching for an exact match. 
+ */ + ASSERT(bp->b_flags & XBF_STALE); + return 1; + } + return 0; +} + +static const struct rhashtable_params xfs_buf_hash_params = { + .min_size = 32, /* empty AGs have minimal footprint */ + .nelem_hint = 16, + .key_len = sizeof(xfs_daddr_t), + .key_offset = offsetof(struct xfs_buf, b_bn), + .head_offset = offsetof(struct xfs_buf, b_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = _xfs_buf_obj_cmp, +}; + +int +xfs_buf_hash_init( + struct xfs_perag *pag) +{ + spin_lock_init(&pag->pag_buf_lock); + return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); +} + +void +xfs_buf_hash_destroy( + struct xfs_perag *pag) +{ + rhashtable_destroy(&pag->pag_buf_hash); +} /* * Look up, and creates if absent, a lockable buffer for @@ -488,27 +543,24 @@ _xfs_buf_find( xfs_buf_t *new_bp) { struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent; xfs_buf_t *bp; - xfs_daddr_t blkno = map[0].bm_bn; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int numblks = 0; int i; for (i = 0; i < nmaps; i++) - numblks += map[i].bm_len; + cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. 
*/ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno < 0 || blkno >= eofs) { + if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -516,53 +568,29 @@ _xfs_buf_find( */ xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", - __func__, blkno, eofs); + __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; } - /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, blkno)); + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - /* walk tree */ spin_lock(&pag->pag_buf_lock); - rbp = &pag->pag_buf_tree.rb_node; - parent = NULL; - bp = NULL; - while (*rbp) { - parent = *rbp; - bp = rb_entry(parent, struct xfs_buf, b_rbnode); - - if (blkno < bp->b_bn) - rbp = &(*rbp)->rb_left; - else if (blkno > bp->b_bn) - rbp = &(*rbp)->rb_right; - else { - /* - * found a block number match. If the range doesn't - * match, the only way this is allowed is if the buffer - * in the cache is stale and the transaction that made - * it stale has not yet committed. i.e. we are - * reallocating a busy extent. Skip this buffer and - * continue searching to the right for an exact match. 
- */ - if (bp->b_length != numblks) { - ASSERT(bp->b_flags & XBF_STALE); - rbp = &(*rbp)->rb_right; - continue; - } - atomic_inc(&bp->b_hold); - goto found; - } + bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, + xfs_buf_hash_params); + if (bp) { + atomic_inc(&bp->b_hold); + goto found; } /* No match found */ if (new_bp) { - rb_link_node(&new_bp->b_rbnode, parent, rbp); - rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); @@ -930,7 +958,6 @@ xfs_buf_rele( if (!pag) { ASSERT(list_empty(&bp->b_lru)); - ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) { xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); @@ -938,8 +965,6 @@ xfs_buf_rele( return; } - ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); - ASSERT(atomic_read(&bp->b_hold) > 0); release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); @@ -983,7 +1008,8 @@ xfs_buf_rele( } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); freebuf = true; @@ -1711,8 +1737,7 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_blkdev_issue_flush(btp); + xfs_blkdev_issue_flush(btp); kmem_free(btp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c2e52b2d926..8a9d3a9599f0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ @@ 
-150,7 +151,7 @@ typedef struct xfs_buf { * which is the only bit that is touched if we hit the semaphore * fast-path on locking. */ - struct rb_node b_rbnode; /* rbtree node */ + struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a..003a99b83bd8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -677,7 +677,6 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -686,7 +685,6 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d818c160451f..65d27a502909 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -48,40 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; /* - * Locking primitives for read and write IO paths to ensure we consistently use - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. - */ -static inline void -xfs_rw_ilock( - struct xfs_inode *ip, - int type) -{ - if (type & XFS_IOLOCK_EXCL) - inode_lock(VFS_I(ip)); - xfs_ilock(ip, type); -} - -static inline void -xfs_rw_iunlock( - struct xfs_inode *ip, - int type) -{ - xfs_iunlock(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -static inline void -xfs_rw_ilock_demote( - struct xfs_inode *ip, - int type) -{ - xfs_ilock_demote(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -/* * Clear the specified ranges to zero through either the pagecache or DAX. * Holes and unwritten extents will be left as-is as they already are zeroed. 
*/ @@ -183,19 +149,16 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { - /* - * If we have an RT and/or log subvolume we need to make sure - * to flush the write cache the device used for file data - * first. This is to ensure newly written file data make - * it to disk before logging the new inode size in case of - * an extending write. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); - else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); - } + /* + * If we have an RT and/or log subvolume we need to make sure to flush + * the write cache the device used for file data first. This is to + * ensure newly written file data make it to disk before logging the new + * inode size in case of an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); /* * All metadata updates are logged, which means that we just have to @@ -230,10 +193,8 @@ xfs_file_fsync( * an already allocated file and thus do not have any metadata to * commit. 
*/ - if ((mp->m_flags & XFS_MOUNT_BARRIER) && - mp->m_logdev_targp == mp->m_ddev_targp && - !XFS_IS_REALTIME_INODE(ip) && - !log_flushed) + if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && + mp->m_logdev_targp == mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; @@ -244,62 +205,21 @@ xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - loff_t isize = i_size_read(inode); + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); size_t count = iov_iter_count(to); - loff_t end = iocb->ki_pos + count - 1; - struct iov_iter data; - struct xfs_buftarg *target; - ssize_t ret = 0; + ssize_t ret; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ - if (XFS_IS_REALTIME_INODE(ip)) - target = ip->i_mount->m_rtdev_targp; - else - target = ip->i_mount->m_ddev_targp; - - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } - file_accessed(iocb->ki_filp); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out_unlock; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. 
- */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); - data = *to; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); - if (ret >= 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } - -out_unlock: - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -317,9 +237,9 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); return ret; @@ -335,9 +255,9 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -418,15 +338,18 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock, true); + error = xfs_break_layouts(inode, iolock); if (error) return error; - /* For changing security info in file_remove_privs() we need i_mutex */ + /* + * For changing security info in file_remove_privs() we need i_rwsem + * exclusively. 
+ */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); goto restart; } /* @@ -451,9 +374,9 @@ restart: spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* @@ -496,6 +419,58 @@ restart: return 0; } +static int +xfs_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + loff_t offset = iocb->ki_pos; + bool update_size = false; + int error = 0; + + trace_xfs_end_io_direct_write(ip, offset, size); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (size <= 0) + return size; + + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. 
+ */ + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) { + i_size_write(inode, offset + size); + update_size = true; + } + spin_unlock(&ip->i_flags_lock); + + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + if (flags & IOMAP_DIO_UNWRITTEN) + error = xfs_iomap_write_unwritten(ip, offset, size); + else if (update_size) + error = xfs_setfilesize(ip, offset, size); + + return error; +} + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -535,9 +510,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - loff_t end; - struct iov_iter data; - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ @@ -559,29 +532,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); - end = iocb->ki_pos + count - 1; - - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. 
- */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } /* * If we are doing unaligned IO, wait for all other IO to drain, @@ -591,7 +547,7 @@ xfs_file_dio_aio_write( if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -604,24 +560,9 @@ xfs_file_dio_aio_write( goto out; } - data = *from; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - - /* see generic_file_direct_write() for why this is necessary */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } - - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either @@ -643,7 +584,7 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -652,15 +593,13 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } - out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return error ? 
error : ret; } @@ -677,7 +616,7 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -721,7 +660,7 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return ret; } @@ -797,7 +736,7 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -939,7 +878,6 @@ xfs_file_clone_range( len, false); } -#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) STATIC ssize_t xfs_file_dedupe_range( struct file *src_file, @@ -950,14 +888,6 @@ xfs_file_dedupe_range( { int error; - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. - */ - if (len > XFS_MAX_DEDUPE_LEN) - len = XFS_MAX_DEDUPE_LEN; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) @@ -1501,15 +1431,9 @@ xfs_filemap_fault( return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) { - /* - * we do not want to trigger unwritten extent conversion on read - * faults - that is unnecessary overhead and would also require - * changes to xfs_get_blocks_direct() to map unwritten extent - * ioend for conversion on read-only mappings. 
- */ + if (IS_DAX(inode)) ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); - } else + else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f295049db681..ff4d6311c7f4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,8 +70,6 @@ xfs_inode_alloc( ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -123,7 +121,6 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); @@ -133,6 +130,8 @@ void xfs_inode_free( struct xfs_inode *ip) { + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -393,8 +392,8 @@ xfs_iget_cache_hit( xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + init_rwsem(&inode->i_rwsem); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -981,6 +980,7 @@ restart: if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip, false); goto reclaim; } @@ -989,10 +989,10 @@ restart: goto out_ifunlock; xfs_iunpin_wait(ip); } - if (xfs_iflags_test(ip, XFS_ISTALE)) - goto reclaim; - if (xfs_inode_clean(ip)) + if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { + xfs_ifunlock(ip); goto reclaim; + } /* * Never flush out dirty data during non-blocking reclaim, as it would @@ -1030,25 +1030,24 @@ restart: xfs_buf_relse(bp); } - xfs_iflock(ip); reclaim: + 
ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. - * We do this as early as possible under the ILOCK and flush lock so - * that xfs_iflush_cluster() can be guaranteed to detect races with us - * here. By doing this, we guarantee that once xfs_iflush_cluster has - * locked both the XFS_ILOCK and the flush lock that it will see either - * a valid, flushable inode that will serialise correctly against the - * locks below, or it will see a clean (and invalid) inode that it can - * skip. + * We do this as early as possible under the ILOCK so that + * xfs_iflush_cluster() can be guaranteed to detect races with us here. + * By doing this, we guarantee that once xfs_iflush_cluster has locked + * XFS_ILOCK that it will see either a valid, flushable inode that will + * serialise correctly, or it will see a clean (and invalid) inode that + * it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); @@ -1580,10 +1579,15 @@ xfs_inode_free_cowblocks( struct xfs_eofblocks *eofb = args; bool need_iolock = true; int match; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_reflink_has_real_cow_blocks(ip)) { + /* + * Just clear the tag if we have an empty cow fork or none at all. It's + * possible the inode was fully unshared since it was originally tagged. 
+ */ + if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); return 0; diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d45ca72af6fb..865ad1373e5e 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -133,7 +133,7 @@ xfs_icreate_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_icreate_item_ops = { +static const struct xfs_item_ops xfs_icreate_item_ops = { .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, .iop_pin = xfs_icreate_item_pin, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4e560e6a12c1..b9557795eb74 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and - * the i_lock. This routine allows various combinations of the locks to be - * obtained. + * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 + * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, * the mmap lock second and the ilock last in order to prevent deadlock. * * Basic locking order: * - * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * * mmap_sem locking order: * - * i_iolock -> page lock -> mmap_sem + * i_rwsem -> page lock -> mmap_sem * mmap_sem -> i_mmap_lock -> page_lock * * The difference in mmap_sem locking order mean that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can * fault in pages during copy in/out (for buffered IO) or require the mmap_sem * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. 
Similarly the i_iolock cannot be taken inside a page fault because + * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because * page faults already hold the mmap_sem. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both * taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). @@ -191,10 +191,13 @@ xfs_ilock( (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_IOLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_IOLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_MMAPLOCK_EXCL) mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) + if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) + if (!down_read_trylock(&VFS_I(ip)->i_rwsem)) goto out; } @@ -271,9 +274,9 @@ out_undo_mmaplock: mrunlock_shared(&ip->i_mmaplock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); out: return 0; } @@ -310,9 +313,9 @@ xfs_iunlock( 
ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) mrunlock_excl(&ip->i_mmaplock); @@ -345,7 +348,7 @@ xfs_ilock_demote( if (lock_flags & XFS_MMAPLOCK_EXCL) mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); + downgrade_write(&VFS_I(ip)->i_rwsem); trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } @@ -370,8 +373,9 @@ xfs_isilocked( if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); + return !debug_locks || + lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); + return rwsem_is_locked(&VFS_I(ip)->i_rwsem); } ASSERT(0); @@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); - ASSERT(xfs_lockdep_subclass_ok(subclass + - XFS_IOLOCK_PARENT_VAL)); class += subclass << XFS_IOLOCK_SHIFT; - if (lock_mode & XFS_IOLOCK_PARENT) - class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { @@ -477,8 +477,6 @@ xfs_lock_inodes( XFS_ILOCK_EXCL)); ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED))); - ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || - inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || @@ -581,10 +579,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - } else if (lock_mode & 
(XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -715,7 +711,6 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) goto out_unlock; @@ -724,14 +719,12 @@ xfs_lookup( if (error) goto out_free_name; - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -1215,8 +1208,7 @@ xfs_create( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); @@ -1252,7 +1244,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. 
*/ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1325,7 +1317,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -1466,11 +1458,10 @@ xfs_link( if (error) goto std_return; - xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -2041,7 +2032,6 @@ xfs_iunlink( agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); return 0; @@ -2133,7 +2123,6 @@ xfs_iunlink_remove( agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); } else { @@ -2579,10 +2568,9 @@ xfs_remove( goto std_return; } - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2963,12 +2951,6 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. 
*/ - if (!new_parent) - xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - else - xfs_lock_two_inodes(src_dp, target_dp, - XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2976,9 +2958,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f14c1de2549d..10dcf27b4c85 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,7 +56,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_iolock; /* inode IO lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ @@ -246,6 +245,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * Synchronize processes attempting to flush the in-core inode back to disk. */ +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + extern void __xfs_iflock(struct xfs_inode *ip); static inline int xfs_iflock_nowait(struct xfs_inode *ip) @@ -261,16 +265,12 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { + ASSERT(xfs_isiflocked(ip)); xfs_iflags_clear(ip, XFS_IFLOCK); smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - /* * Flags for inode locking. 
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) @@ -332,7 +332,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * IOLOCK values * * 0-3 subclass value - * 4-7 PARENT subclass values + * 4-7 unused * * MMAPLOCK values * @@ -347,10 +347,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * */ #define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT_VAL 4 -#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9610e9c00952..d90e7811ccdd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork( struct xfs_bmbt_rec *p; ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); + ASSERT(xfs_iext_count(&ip->i_df) > 0); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); @@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ASSERT(xfs_iext_count(ip->i_afp) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c245bed3249b..fc563b82aea6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -639,7 +639,7 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr( if (attr) { if (ip->i_afp) { if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_afp->if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(ip->i_afp); else fa.fsx_nextents 
= ip->i_d.di_anextents; } else fa.fsx_nextents = 0; } else { if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_df.if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(&ip->i_df); else fa.fsx_nextents = ip->i_d.di_nextents; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 436e109bb01e..0d147428971e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -395,11 +395,12 @@ xfs_iomap_prealloc_size( struct xfs_inode *ip, loff_t offset, loff_t count, - xfs_extnum_t idx, - struct xfs_bmbt_irec *prev) + xfs_extnum_t idx) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + struct xfs_bmbt_irec prev; int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; @@ -419,8 +420,8 @@ xfs_iomap_prealloc_size( */ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - idx == 0 || - prev->br_startoff + prev->br_blockcount < offset_fsb) + !xfs_iext_get_extent(ifp, idx - 1, &prev) || + prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_writeio_blocks; /* @@ -439,8 +440,8 @@ xfs_iomap_prealloc_size( * always extends to MAXEXTLEN rather than falling short due to things * like stripe unit/width alignment of real extents. 
*/ - if (prev->br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev->br_blockcount << 1; + if (prev.br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev.br_blockcount << 1; else alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) @@ -535,11 +536,11 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - xfs_fileoff_t end_fsb, orig_end_fsb; + xfs_fileoff_t end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_extnum_t idx; + xfs_fsblock_t prealloc_blocks = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -563,8 +564,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, - &got, &prev); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); if (!eof && got.br_startoff <= offset_fsb) { if (xfs_is_reflink_inode(ip)) { bool shared; @@ -595,35 +595,32 @@ xfs_file_iomap_begin_delay( * the lower level functions are updated. 
*/ count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = orig_end_fsb = - min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); if (eof) { - xfs_fsblock_t prealloc_blocks; - - prealloc_blocks = - xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; + xfs_fileoff_t p_end_fsb; end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); - end_fsb = XFS_B_TO_FSBT(mp, end_offset) + - prealloc_blocks; + p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; align = xfs_eof_alignment(ip, 0); if (align) - end_fsb = roundup_64(end_fsb, align); + p_end_fsb = roundup_64(p_end_fsb, align); - end_fsb = min(end_fsb, maxbytes_fsb); - ASSERT(end_fsb > offset_fsb); + p_end_fsb = min(p_end_fsb, maxbytes_fsb); + ASSERT(p_end_fsb > offset_fsb); + prealloc_blocks = p_end_fsb - end_fsb; } } retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, &got, - &prev, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); switch (error) { case 0: break; @@ -631,8 +628,8 @@ retry: case -EDQUOT: /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; + if (prealloc_blocks) { + prealloc_blocks = 0; goto retry; } /*FALLTHRU*/ @@ -640,13 +637,6 @@ retry: goto out_unlock; } - /* - * Tag the inode as speculatively preallocated so we can reclaim this - * space on demand, if necessary. 
- */ - if (end_fsb != orig_end_fsb) - xfs_inode_set_eofblocks_tag(ip); - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -960,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && ISUNWRITTEN(imap)); } +static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +{ + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) + return true; + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + return true; + return false; +} + static int xfs_file_iomap_begin( struct inode *inode, @@ -979,18 +982,14 @@ xfs_file_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && - !xfs_get_extsz_hint(ip)) { + if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } - /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. 
- */ - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (need_excl_ilock(ip, flags)) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); } else { @@ -1003,17 +1002,41 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); + if (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { + shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); + if (shared) { + xfs_iunlock(ip, lockmode); + goto alloc_done; + } + ASSERT(!isnullstartblock(imap.br_startblock)); + } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { + if ((flags & IOMAP_REPORT) || + (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If the + * extent is shared, fall back to buffered mode to handle the + * RMW. 
+ */ + if (!(flags & IOMAP_REPORT) && shared) { + trace_xfs_reflink_bounce_dio_write(ip, &imap); + error = -EREMCHG; + goto out_unlock; + } } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { @@ -1048,6 +1071,7 @@ xfs_file_iomap_begin( if (error) return error; +alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 405a65cd9d6b..b930be0b1596 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -983,15 +983,13 @@ xfs_vn_setattr( struct xfs_inode *ip = XFS_I(d_inode(dentry)); uint iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - error = xfs_break_layouts(d_inode(dentry), &iolock, true); - if (!error) { - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_break_layouts(d_inode(dentry), &iolock); + if (error) + return error; - error = xfs_vn_setattr_size(dentry, iattr); - } - xfs_iunlock(ip, iolock); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_vn_setattr_size(dentry, iattr); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { error = xfs_vn_setattr_nonsize(dentry, iattr); } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 68640fb63a54..a415f822f2c1 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -78,6 +78,7 @@ typedef __u32 xfs_nlink_t; #include <linux/freezer.h> #include <linux/list_sort.h> #include <linux/ratelimit.h> +#include <linux/rhashtable.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3b74fa011bb1..c39ac14ff540 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1668,7 +1668,7 @@ xlog_cksum( __uint32_t crc; /* first generate the crc for the record header ... 
*/ - crc = xfs_start_cksum((char *)rhead, + crc = xfs_start_cksum_update((char *)rhead, sizeof(struct xlog_rec_header), offsetof(struct xlog_rec_header, h_crc)); @@ -1862,26 +1862,21 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { - bp->b_flags |= XBF_FUA; - - /* - * Flush the data device before flushing the log to make - * sure all meta data written back from the AIL actually made - * it to disk before stamping the new log tail LSN into the - * log buffer. For an external log we need to issue the - * flush explicitly, and unfortunately synchronously here; - * for an internal log we can simply use the block layer - * state machine for preflushes. - */ - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - else - bp->b_flags |= XBF_FLUSH; - } + /* + * Flush the data device before flushing the log to make sure all meta + * data written back from the AIL actually made it to disk before + * stamping the new log tail LSN into the log buffer. For an external + * log we need to issue the flush explicitly, and unfortunately + * synchronously here; for an internal log we can simply use the block + * layer state machine for preflushes. 
+ */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1906,10 +1901,8 @@ xlog_sync( xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) - bp->b_flags |= XBF_FUA; + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..4a98762ec8b4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct list_head *bucket; struct xfs_buf_cancel *bcp; @@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct xfs_buf_cancel *bcp; @@ -5113,19 +5113,21 @@ xlog_recover_process( struct list_head *buffer_list) { int error; + __le32 old_crc = rhead->h_crc; __le32 crc; + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets h_crc to 0 so we must consider this valid even on v5 supers. + * sets old_crc to 0 so we must consider this valid even on v5 supers. * Otherwise, return EFSBADCRC on failure so the callers up the stack * know precisely what failed. 
*/ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != rhead->h_crc) + if (old_crc && crc != old_crc) return -EFSBADCRC; return 0; } @@ -5136,11 +5138,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != rhead->h_crc) { - if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (crc != old_crc) { + if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(rhead->h_crc), + le32_to_cpu(old_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf481..9b9540db17a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -157,6 +157,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -212,8 +213,8 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - spin_lock_init(&pag->pag_buf_lock); - pag->pag_buf_tree = RB_ROOT; + if (xfs_buf_hash_init(pag)) + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) goto out_unwind; @@ -239,9 +240,11 @@ xfs_initialize_perag( return 0; out_unwind: + xfs_buf_hash_destroy(pag); kmem_free(pag); for (; index > first_initialised; index--) { pag = radix_tree_delete(&mp->m_perag_tree, index); + xfs_buf_hash_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb..84f785218907 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -393,8 +393,8 @@ typedef struct xfs_perag { unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree 
of active buffers */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; /* for rcu-safe freeing */ struct rcu_head rcu_head; @@ -424,6 +424,9 @@ xfs_perag_resv( } } +int xfs_buf_hash_init(xfs_perag_t *pag); +void xfs_buf_hash_destroy(xfs_perag_t *pag); + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 93a7aafa56d6..2f2dc3c09ad0 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,8 +32,7 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock, - bool with_imutex) + uint *iolock) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -42,12 +41,8 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); - if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; - if (with_imutex) - inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index e8339f74966b..b587cb99b2b7 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +int xfs_break_layouts(struct inode *inode, uint *iolock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +xfs_break_layouts(struct inode *inode, uint *iolock) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a60d9e2739d1..45e50ea90769 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks( return error; } rtblks = 0; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0; 
idx < nextents; idx++) rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); *O_rtblks = (xfs_qcnt_t)rtblks; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a279b4e7f5fe..88fd03c66e99 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -243,12 +243,11 @@ xfs_reflink_reserve_cow( struct xfs_bmbt_irec *imap, bool *shared) { - struct xfs_bmbt_irec got, prev; - xfs_fileoff_t end_fsb, orig_end_fsb; - int eof = 0, error = 0; - bool trimmed; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; + int error = 0; + bool eof = false, trimmed; xfs_extnum_t idx; - xfs_extlen_t align; /* * Search the COW fork extent list first. This serves two purposes: @@ -258,8 +257,9 @@ xfs_reflink_reserve_cow( * extent list is generally faster than going out to the shared extent * tree. */ - xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, - &got, &prev); + + if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got)) + eof = true; if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); @@ -285,33 +285,12 @@ xfs_reflink_reserve_cow( if (error) return error; - end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; - - align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); - if (align) - end_fsb = roundup_64(end_fsb, align); - -retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - end_fsb - imap->br_startoff, &got, &prev, &idx, eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ + imap->br_blockcount, 0, &got, &idx, eof); + if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; - goto retry; - } - /*FALLTHRU*/ - default: + if (error) return error; - } - - if (end_fsb != orig_end_fsb) - 
xfs_inode_set_cowblocks_tag(ip); trace_xfs_reflink_cow_alloc(ip, &got); return 0; @@ -418,87 +397,65 @@ xfs_reflink_allocate_cow_range( } /* - * Find the CoW reservation (and whether or not it needs block allocation) - * for a given byte offset of a file. + * Find the CoW reservation for a given byte offset of a file. */ bool xfs_reflink_find_cow_mapping( struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, - bool *need_alloc) + struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_fileoff_t bno; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb; + struct xfs_bmbt_irec got; xfs_extnum_t idx; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(xfs_is_reflink_inode(ip)); - /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - bno = XFS_B_TO_FSBT(ip->i_mount, offset); - gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); - if (!gotp) + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return false; - - xfs_bmbt_get_all(gotp, &irec); - if (bno >= irec.br_startoff + irec.br_blockcount || - bno < irec.br_startoff) + if (got.br_startoff > offset_fsb) return false; trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, - &irec); - - /* If it's still delalloc, we must allocate later. */ - *imap = irec; - *need_alloc = !!(isnullstartblock(irec.br_startblock)); - + &got); + *imap = got; return true; } /* * Trim an extent to end at the next CoW reservation past offset_fsb. 
*/ -int +void xfs_reflink_trim_irec_to_next_cow( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; xfs_extnum_t idx; if (!xfs_is_reflink_inode(ip)) - return 0; + return; /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); - if (!gotp) - return 0; - xfs_bmbt_get_all(gotp, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + return; /* This is the extent before; try sliding up one. */ - if (irec.br_startoff < offset_fsb) { - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - return 0; - gotp = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_get_all(gotp, &irec); + if (got.br_startoff < offset_fsb) { + if (!xfs_iext_get_extent(ifp, idx + 1, &got)) + return; } - if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) - return 0; + if (got.br_startoff >= imap->br_startoff + imap->br_blockcount) + return; - imap->br_blockcount = irec.br_startoff - imap->br_startoff; + imap->br_blockcount = got.br_startoff - imap->br_startoff; trace_xfs_reflink_trim_irec(ip, imap); - - return 0; } /* @@ -512,18 +469,15 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error = 0, eof = 0; + int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; - - xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, - &got, &prev); - if (eof) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return 0; while (got.br_startoff < end_fsb) { @@ -566,9 +520,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - if (++idx >= 
ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + if (!xfs_iext_get_extent(ifp, ++idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } /* clear tag if cow fork is emptied */ @@ -638,13 +591,13 @@ xfs_reflink_end_cow( xfs_off_t count) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error, eof = 0; + int error; unsigned int resblks; xfs_filblks_t rlen; xfs_extnum_t idx; @@ -668,13 +621,11 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, - &got, &prev); - /* If there is a hole at end_fsb - 1 go to the previous extent */ - if (eof || got.br_startoff > end_fsb) { + if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || + got.br_startoff > end_fsb) { ASSERT(idx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + xfs_iext_get_extent(ifp, --idx, &got); } /* Walk backwards until we're out of the I/O range... 
*/ @@ -722,11 +673,9 @@ xfs_reflink_end_cow( error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_defer; - next_extent: - if (idx < 0) + if (!xfs_iext_get_extent(ifp, idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } error = xfs_trans_commit(tp); @@ -1302,13 +1251,11 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - if (same_inode) { - xfs_ilock(src, XFS_IOLOCK_EXCL); + lock_two_nondirectories(inode_in, inode_out); + if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + else xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - } /* Don't touch certain kinds of inodes */ ret = -EPERM; @@ -1345,8 +1292,14 @@ xfs_reflink_remap_range( goto out_unlock; } - if (len == 0) + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (len == 0) { + if (is_dedupe) { + ret = 0; + goto out_unlock; + } len = isize - pos_in; + } /* Ensure offsets don't wrap and the input is inside i_size */ if (pos_in + len < pos_in || pos_out + len < pos_out || @@ -1447,11 +1400,9 @@ xfs_reflink_remap_range( out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - xfs_iunlock(src, XFS_IOLOCK_EXCL); - if (src->i_ino != dest->i_ino) { + if (!same_inode) xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - xfs_iunlock(dest, XFS_IOLOCK_EXCL); - } + unlock_two_nondirectories(inode_in, inode_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; @@ -1697,37 +1648,3 @@ out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } - -/* - * Does this inode have any real CoW reservations? - */ -bool -xfs_reflink_has_real_cow_blocks( - struct xfs_inode *ip) -{ - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_extnum_t idx; - - if (!xfs_is_reflink_inode(ip)) - return false; - - /* Go find the old extent in the CoW fork. 
*/ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); - while (gotp) { - xfs_bmbt_get_all(gotp, &irec); - - if (!isnullstartblock(irec.br_startblock)) - return true; - - /* Roll on... */ - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - break; - gotp = xfs_iext_get_ext(ifp, idx); - } - - return false; -} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index fad11607c9ad..aa6a4d64bd35 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -31,8 +31,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, bool *need_alloc); -extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap); +extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, @@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); -extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); - #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 12d48cd8f8a4..f11282c96887 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; - xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; - xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, 
i)->s.xs_read_bytes; } len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", @@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ - vn_active = per_cpu_ptr(stats, c)->vn_active; + vn_active = per_cpu_ptr(stats, c)->s.vn_active; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); - per_cpu_ptr(stats, c)->vn_active = vn_active; + per_cpu_ptr(stats, c)->s.vn_active = vn_active; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 79ad2e69fc33..375840f5a99a 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -22,9 +22,37 @@ #include <linux/percpu.h> /* + * The btree stats arrays have fixed offsets for the different stats. We + * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and + * that allows us to use fixed offsets into the stats array for each btree + * stat. These index offsets are defined in the order they will be emitted + * in the stats files, so it is possible to add new btree stat types by + * appending to the enum list below. 
+ */ +enum { + __XBTS_lookup = 0, + __XBTS_compare = 1, + __XBTS_insrec = 2, + __XBTS_delrec = 3, + __XBTS_newroot = 4, + __XBTS_killroot = 5, + __XBTS_increment = 6, + __XBTS_decrement = 7, + __XBTS_lshift = 8, + __XBTS_rshift = 9, + __XBTS_split = 10, + __XBTS_join = 11, + __XBTS_alloc = 12, + __XBTS_free = 13, + __XBTS_moves = 14, + + __XBTS_MAX = 15, +}; + +/* * XFS global statistics */ -struct xfsstats { +struct __xfsstats { # define XFSSTAT_END_EXTENT_ALLOC 4 __uint32_t xs_allocx; __uint32_t xs_allocb; @@ -117,118 +145,20 @@ struct xfsstats { __uint32_t xb_page_found; __uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) - __uint32_t xs_abtb_2_lookup; - __uint32_t xs_abtb_2_compare; - __uint32_t xs_abtb_2_insrec; - __uint32_t xs_abtb_2_delrec; - __uint32_t xs_abtb_2_newroot; - __uint32_t xs_abtb_2_killroot; - __uint32_t xs_abtb_2_increment; - __uint32_t xs_abtb_2_decrement; - __uint32_t xs_abtb_2_lshift; - __uint32_t xs_abtb_2_rshift; - __uint32_t xs_abtb_2_split; - __uint32_t xs_abtb_2_join; - __uint32_t xs_abtb_2_alloc; - __uint32_t xs_abtb_2_free; - __uint32_t xs_abtb_2_moves; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) - __uint32_t xs_abtc_2_lookup; - __uint32_t xs_abtc_2_compare; - __uint32_t xs_abtc_2_insrec; - __uint32_t xs_abtc_2_delrec; - __uint32_t xs_abtc_2_newroot; - __uint32_t xs_abtc_2_killroot; - __uint32_t xs_abtc_2_increment; - __uint32_t xs_abtc_2_decrement; - __uint32_t xs_abtc_2_lshift; - __uint32_t xs_abtc_2_rshift; - __uint32_t xs_abtc_2_split; - __uint32_t xs_abtc_2_join; - __uint32_t xs_abtc_2_alloc; - __uint32_t xs_abtc_2_free; - __uint32_t xs_abtc_2_moves; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) - __uint32_t xs_bmbt_2_lookup; - __uint32_t xs_bmbt_2_compare; - __uint32_t xs_bmbt_2_insrec; - __uint32_t xs_bmbt_2_delrec; - __uint32_t xs_bmbt_2_newroot; - __uint32_t xs_bmbt_2_killroot; - __uint32_t xs_bmbt_2_increment; - __uint32_t xs_bmbt_2_decrement; - 
__uint32_t xs_bmbt_2_lshift; - __uint32_t xs_bmbt_2_rshift; - __uint32_t xs_bmbt_2_split; - __uint32_t xs_bmbt_2_join; - __uint32_t xs_bmbt_2_alloc; - __uint32_t xs_bmbt_2_free; - __uint32_t xs_bmbt_2_moves; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) - __uint32_t xs_ibt_2_lookup; - __uint32_t xs_ibt_2_compare; - __uint32_t xs_ibt_2_insrec; - __uint32_t xs_ibt_2_delrec; - __uint32_t xs_ibt_2_newroot; - __uint32_t xs_ibt_2_killroot; - __uint32_t xs_ibt_2_increment; - __uint32_t xs_ibt_2_decrement; - __uint32_t xs_ibt_2_lshift; - __uint32_t xs_ibt_2_rshift; - __uint32_t xs_ibt_2_split; - __uint32_t xs_ibt_2_join; - __uint32_t xs_ibt_2_alloc; - __uint32_t xs_ibt_2_free; - __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) - __uint32_t xs_fibt_2_lookup; - __uint32_t xs_fibt_2_compare; - __uint32_t xs_fibt_2_insrec; - __uint32_t xs_fibt_2_delrec; - __uint32_t xs_fibt_2_newroot; - __uint32_t xs_fibt_2_killroot; - __uint32_t xs_fibt_2_increment; - __uint32_t xs_fibt_2_decrement; - __uint32_t xs_fibt_2_lshift; - __uint32_t xs_fibt_2_rshift; - __uint32_t xs_fibt_2_split; - __uint32_t xs_fibt_2_join; - __uint32_t xs_fibt_2_alloc; - __uint32_t xs_fibt_2_free; - __uint32_t xs_fibt_2_moves; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15) - __uint32_t xs_rmap_2_lookup; - __uint32_t xs_rmap_2_compare; - __uint32_t xs_rmap_2_insrec; - __uint32_t xs_rmap_2_delrec; - __uint32_t xs_rmap_2_newroot; - __uint32_t xs_rmap_2_killroot; - __uint32_t xs_rmap_2_increment; - __uint32_t xs_rmap_2_decrement; - __uint32_t xs_rmap_2_lshift; - __uint32_t xs_rmap_2_rshift; - __uint32_t xs_rmap_2_split; - __uint32_t xs_rmap_2_join; - __uint32_t xs_rmap_2_alloc; - __uint32_t xs_rmap_2_free; - __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) - __uint32_t xs_refcbt_2_lookup; - __uint32_t xs_refcbt_2_compare; - __uint32_t xs_refcbt_2_insrec; - __uint32_t xs_refcbt_2_delrec; - __uint32_t xs_refcbt_2_newroot; - __uint32_t 
xs_refcbt_2_killroot; - __uint32_t xs_refcbt_2_increment; - __uint32_t xs_refcbt_2_decrement; - __uint32_t xs_refcbt_2_lshift; - __uint32_t xs_refcbt_2_rshift; - __uint32_t xs_refcbt_2_split; - __uint32_t xs_refcbt_2_join; - __uint32_t xs_refcbt_2_alloc; - __uint32_t xs_refcbt_2_free; - __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) + __uint32_t xs_abtb_2[__XBTS_MAX]; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) + __uint32_t xs_abtc_2[__XBTS_MAX]; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) + __uint32_t xs_bmbt_2[__XBTS_MAX]; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) + __uint32_t xs_ibt_2[__XBTS_MAX]; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) + __uint32_t xs_fibt_2[__XBTS_MAX]; +#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) + __uint32_t xs_rmap_2[__XBTS_MAX]; +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) + __uint32_t xs_refcbt_2[__XBTS_MAX]; #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; @@ -245,26 +175,58 @@ struct xfsstats { __uint64_t xs_read_bytes; }; +struct xfsstats { + union { + struct __xfsstats s; + uint32_t a[XFSSTAT_END_XQMSTAT]; + }; +}; + +/* + * simple wrapper for getting the array index of s struct member offset + */ +#define XFS_STATS_CALC_INDEX(member) \ + (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t)) + + int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); void xfs_stats_clearall(struct xfsstats __percpu *stats); extern struct xstats xfsstats; #define XFS_STATS_INC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \ } while (0) #define XFS_STATS_DEC(mp, v) \ do { \ - 
per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \ } while (0) #define XFS_STATS_ADD(mp, v, inc) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \ +} while (0) + +#define XFS_STATS_INC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \ +} while (0) + +#define XFS_STATS_DEC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]--; \ +} while (0) + +#define XFS_STATS_ADD_OFF(mp, off, inc) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \ } while (0) #if defined(CONFIG_PROC_FS) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ade4691e3f74..eecbaac08eba 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -104,9 +104,6 @@ static const match_table_t tokens = { {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ - {Opt_barrier, "barrier"}, /* use writer barriers for log write and - * unwritten extent conversion */ - {Opt_nobarrier, "nobarrier"}, /* .. 
disable */ {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ {Opt_inode32, "inode32"}, /* inode allocation limited to * XFS_MAXINUMBER_32 */ @@ -134,6 +131,12 @@ static const match_table_t tokens = { {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ + + /* Deprecated mount options scheduled for removal */ + {Opt_barrier, "barrier"}, /* use writer barriers for log write and + * unwritten extent conversion */ + {Opt_nobarrier, "nobarrier"}, /* .. disable */ + {Opt_err, NULL}, }; @@ -301,12 +304,6 @@ xfs_parseargs( case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; break; - case Opt_barrier: - mp->m_flags |= XFS_MOUNT_BARRIER; - break; - case Opt_nobarrier: - mp->m_flags &= ~XFS_MOUNT_BARRIER; - break; case Opt_ikeep: mp->m_flags |= XFS_MOUNT_IKEEP; break; @@ -374,6 +371,14 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DAX; break; #endif + case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; default: xfs_warn(mp, "unknown mount option [%s].", p); return -EINVAL; @@ -943,7 +948,7 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); @@ -1238,9 +1243,11 @@ xfs_fs_remount( token = match_token(p, tokens, args); switch (token) { case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags |= XFS_MOUNT_BARRIER; break; case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags &= ~XFS_MOUNT_BARRIER; break; case Opt_inode64: diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 58142aeeeea6..f2cb45ed1d54 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ 
-238,8 +238,7 @@ xfs_symlink( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -287,7 +286,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -412,7 +411,7 @@ out_release_inode: xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0907752be62d..69c5bcd9a51b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_submit); DEFINE_BUF_EVENT(xfs_buf_submit_wait); -DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock_fail); @@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); -DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); -DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); -DEFINE_BUF_EVENT(xfs_da_btree_corrupt); DEFINE_BUF_EVENT(xfs_reset_dqcounts); -DEFINE_BUF_EVENT(xfs_inode_item_push); /* pass flags explicitly */ DECLARE_EVENT_CLASS(xfs_buf_flags_class, @@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); 
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); -DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); DECLARE_EVENT_CLASS(xfs_filestream_class, TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), @@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_evict_inode); DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); @@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post, DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ TP_ARGS(ip, caller_ip)) -DEFINE_IREF_EVENT(xfs_ihold); DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); @@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); DEFINE_DQUOT_EVENT(xfs_dqput); -DEFINE_DQUOT_EVENT(xfs_dqput_wait); DEFINE_DQUOT_EVENT(xfs_dqput_free); DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); @@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname); DEFINE_ATTR_EVENT(xfs_attr_sf_create); DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); DEFINE_ATTR_EVENT(xfs_attr_sf_remove); -DEFINE_ATTR_EVENT(xfs_attr_sf_removename); DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); @@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); DEFINE_ATTR_EVENT(xfs_attr_node_get); -DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); @@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done); DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); -DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error); DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work); 
DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); @@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name, \ struct xfs_inode *dest, xfs_off_t doffset), \ TP_ARGS(src, soffset, len, dest, doffset)) -/* two-file vfs io tracepoint class */ -DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, - TP_PROTO(struct inode *src, u64 soffset, u64 len, - struct inode *dest, u64 doffset), - TP_ARGS(src, soffset, len, dest, doffset), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(unsigned long, src_ino) - __field(loff_t, src_isize) - __field(loff_t, src_offset) - __field(size_t, len) - __field(unsigned long, dest_ino) - __field(loff_t, dest_isize) - __field(loff_t, dest_offset) - ), - TP_fast_assign( - __entry->dev = src->i_sb->s_dev; - __entry->src_ino = src->i_ino; - __entry->src_isize = i_size_read(src); - __entry->src_offset = soffset; - __entry->len = len; - __entry->dest_ino = dest->i_ino; - __entry->dest_isize = i_size_read(dest); - __entry->dest_offset = doffset; - ), - TP_printk("dev %d:%d count %zd " - "ino 0x%lx isize 0x%llx offset 0x%llx -> " - "ino 0x%lx isize 0x%llx offset 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->len, - __entry->src_ino, - __entry->src_isize, - __entry->src_offset, - __entry->dest_ino, - __entry->dest_isize, - __entry->dest_offset) -) - -#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \ -DEFINE_EVENT(xfs_double_vfs_io_class, name, \ - TP_PROTO(struct inode *src, u64 soffset, u64 len, \ - struct inode *dest, u64 doffset), \ - TP_ARGS(src, soffset, len, dest, doffset)) - -/* CoW write tracepoint */ -DECLARE_EVENT_CLASS(xfs_copy_on_write_class, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, pblk, len, new_pblk), - 
TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_fsblock_t, pblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->pblk = pblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " - "len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->pblk, - __entry->len, - __entry->new_pblk) -) - -#define DEFINE_COW_EVENT(name) \ -DEFINE_EVENT(xfs_copy_on_write_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ - xfs_extlen_t len, xfs_fsblock_t new_pblk), \ - TP_ARGS(ip, lblk, pblk, len, new_pblk)) - /* inode/irec events */ DECLARE_EVENT_CLASS(xfs_inode_irec_class, TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), @@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); @@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); /* ioctl tracepoints */ -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); TRACE_EVENT(xfs_ioctl_clone, TP_PROTO(struct inode *src, struct inode *dest), TP_ARGS(src, dest), @@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ 
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); -DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); @@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); -DEFINE_COW_EVENT(xfs_reflink_fork_buf); -DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); -DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); /* rmap swapext tracepoints */ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 62900938f26d..0594db435972 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static int +static void __xfs_xattr_put_listent( struct xfs_attr_list_context *context, char *prefix, @@ -148,7 +148,7 @@ __xfs_xattr_put_listent( if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ context->seen_enough = 1; - return 0; + return; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -159,10 +159,10 @@ __xfs_xattr_put_listent( compute_size: context->count += prefix_len + namelen + 1; - return 0; + return; } -static int +static void xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, @@ -180,23 +180,19 @@ xfs_xattr_put_listent( if (namelen == SGI_ACL_FILE_SIZE && strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) 
{ - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_ACCESS, strlen(XATTR_POSIX_ACL_ACCESS)); - if (ret) - return ret; } else if (namelen == SGI_ACL_DEFAULT_SIZE && strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_DEFAULT, strlen(XATTR_POSIX_ACL_DEFAULT)); - if (ret) - return ret; } #endif @@ -205,7 +201,7 @@ xfs_xattr_put_listent( * see them. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return; prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; @@ -217,8 +213,9 @@ xfs_xattr_put_listent( prefix_len = XATTR_USER_PREFIX_LEN; } - return __xfs_xattr_put_listent(context, prefix, prefix_len, name, - namelen); + __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); + return; } ssize_t diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f185156de74d..a4c94b86401e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -50,6 +50,7 @@ struct iomap { #define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. 
FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ +#define IOMAP_DIRECT (1 << 4) /* direct I/O */ struct iomap_ops { /* @@ -83,4 +84,14 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, struct iomap_ops *ops); +/* + * Flags for direct I/O ->end_io: + */ +#define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ +#define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ +typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, + unsigned flags); +ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops, iomap_dio_end_io_t end_io); + #endif /* LINUX_IOMAP_H */ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c1458fede1f9..1e327bb80838 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -338,9 +338,18 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); -#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +/* + * Same "read" as for lock_acquire(), except -1 means any. 
+ */ +extern int lock_is_held_type(struct lockdep_map *lock, int read); + +static inline int lock_is_held(struct lockdep_map *lock) +{ + return lock_is_held_type(lock, -1); +} -extern int lock_is_held(struct lockdep_map *lock); +#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +#define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, @@ -372,6 +381,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) +#define lockdep_assert_held_exclusive(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ + } while (0) + +#define lockdep_assert_held_read(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \ + } while (0) + #define lockdep_assert_held_once(l) do { \ WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ } while (0) @@ -428,7 +445,11 @@ struct lock_class_key { }; #define lockdep_depth(tsk) (0) +#define lockdep_is_held_type(l, r) (1) + #define lockdep_assert_held(l) do { (void)(l); } while (0) +#define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0) +#define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7bd265f6b098..7c38f8f3d97b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3191,7 +3191,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock); +static int __lock_is_held(struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. 
@@ -3332,7 +3332,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock)) + if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) @@ -3579,7 +3579,7 @@ found_it: return 1; } -static int __lock_is_held(struct lockdep_map *lock) +static int __lock_is_held(struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3587,8 +3587,12 @@ static int __lock_is_held(struct lockdep_map *lock) for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (read == -1 || hlock->read == read) + return 1; + + return 0; + } } return 0; @@ -3772,7 +3776,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held(struct lockdep_map *lock) +int lock_is_held_type(struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -3784,13 +3788,13 @@ int lock_is_held(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - ret = __lock_is_held(lock); + ret = __lock_is_held(lock, read); current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(lock_is_held); +EXPORT_SYMBOL_GPL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { |