Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--	fs/xfs/xfs_inode.c	698
1 file changed, 319 insertions(+), 379 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9aea7d68d8ab..407d6299606d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,7 +44,6 @@ kmem_zone_t *xfs_inode_zone;
  */
 #define	XFS_ITRUNC_MAX_EXTENTS	2
 
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
 STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
 
@@ -1740,10 +1739,31 @@ xfs_inactive_ifree(
 		return error;
 	}
 
+	/*
+	 * We do not hold the inode locked across the entire rolling transaction
+	 * here. We only need to hold it for the first transaction that
+	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
+	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
+	 * here breaks the relationship between cluster buffer invalidation and
+	 * stale inode invalidation on cluster buffer item journal commit
+	 * completion, and can result in leaving dirty stale inodes hanging
+	 * around in memory.
+	 *
+	 * We have no need for serialising this inode operation against other
+	 * operations - we freed the inode and hence reallocation is required
+	 * and that will serialise on reallocating the space the deferops need
+	 * to free. Hence we can unlock the inode on the first commit of
+	 * the transaction rather than roll it right through the deferops. This
+	 * avoids relogging the XFS_ISTALE inode.
+	 *
+	 * We check that xfs_ifree() hasn't grown an internal transaction roll
+	 * by asserting that the inode is still locked when it returns.
+	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	error = xfs_ifree(tp, ip);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	if (error) {
 		/*
 		 * If we fail to free the inode, shut down.  The cancel
@@ -1756,7 +1776,6 @@ xfs_inactive_ifree(
 			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 		}
 		xfs_trans_cancel(tp);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		return error;
 	}
 
@@ -1774,7 +1793,6 @@ xfs_inactive_ifree(
 		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
 			__func__, error);
 
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return 0;
 }
 
@@ -2147,7 +2165,6 @@ xfs_iunlink_update_dinode(
 	xfs_dinode_calc_crc(mp, dip);
 	xfs_trans_inode_buf(tp, ibp);
 	xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
-	xfs_inobp_check(mp, ibp);
 }
 
 /* Set an in-core inode's unlinked pointer and return the old value. */
@@ -2248,7 +2265,6 @@ xfs_iunlink(
 	}
 
 	if (next_agino != NULLAGINO) {
-		struct xfs_perag	*pag;
 		xfs_agino_t		old_agino;
 
 		/*
@@ -2265,9 +2281,7 @@ xfs_iunlink(
 		 * agino has been unlinked, add a backref from the next inode
 		 * back to agino.
 		 */
-		pag = xfs_perag_get(mp, agno);
-		error = xfs_iunlink_add_backref(pag, agino, next_agino);
-		xfs_perag_put(pag);
+		error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
 		if (error)
 			return error;
 	}
@@ -2403,7 +2417,6 @@ xfs_iunlink_remove(
 	struct xfs_buf		*agibp;
 	struct xfs_buf		*last_ibp;
 	struct xfs_dinode	*last_dip = NULL;
-	struct xfs_perag	*pag = NULL;
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
 	xfs_agino_t		next_agino;
@@ -2447,32 +2460,22 @@ xfs_iunlink_remove(
 	 * this inode's backref to point from the next inode.
 	 */
 	if (next_agino != NULLAGINO) {
-		pag = xfs_perag_get(mp, agno);
-		error = xfs_iunlink_change_backref(pag, next_agino,
+		error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
 				NULLAGINO);
 		if (error)
-			goto out;
+			return error;
 	}
 
-	if (head_agino == agino) {
-		/* Point the head of the list to the next unlinked inode. */
-		error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
-				next_agino);
-		if (error)
-			goto out;
-	} else {
+	if (head_agino != agino) {
 		struct xfs_imap	imap;
 		xfs_agino_t	prev_agino;
 
-		if (!pag)
-			pag = xfs_perag_get(mp, agno);
-
 		/* We need to search the list for the inode being freed. */
 		error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
 				&prev_agino, &imap, &last_dip, &last_ibp,
-				pag);
+				agibp->b_pag);
 		if (error)
-			goto out;
+			return error;
 
 		/* Point the previous inode on the list to the next inode. */
 		xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
@@ -2486,29 +2489,29 @@ xfs_iunlink_remove(
 		 * change_backref takes care of deleting the backref if
 		 * next_agino is NULLAGINO.
 		 */
-		error = xfs_iunlink_change_backref(pag, agino, next_agino);
-		if (error)
-			goto out;
+		return xfs_iunlink_change_backref(agibp->b_pag, agino,
+				next_agino);
 	}
 
-out:
-	if (pag)
-		xfs_perag_put(pag);
-	return error;
+	/* Point the head of the list to the next unlinked inode. */
+	return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+			next_agino);
 }
 
 /*
- * Look up the inode number specified and mark it stale if it is found. If it is
- * dirty, return the inode so it can be attached to the cluster buffer so it can
- * be processed appropriately when the cluster free transaction completes.
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
  */
-static struct xfs_inode *
-xfs_ifree_get_one_inode(
-	struct xfs_perag	*pag,
+static void
+xfs_ifree_mark_inode_stale(
+	struct xfs_buf		*bp,
 	struct xfs_inode	*free_ip,
 	xfs_ino_t		inum)
 {
-	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_perag	*pag = bp->b_pag;
+	struct xfs_inode_log_item *iip;
 	struct xfs_inode	*ip;
 
retry:
@@ -2516,8 +2519,10 @@ retry:
 	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
 
 	/* Inode not in memory, nothing to do */
-	if (!ip)
-		goto out_rcu_unlock;
+	if (!ip) {
+		rcu_read_unlock();
+		return;
+	}
 
 	/*
 	 * because this is an RCU protected lookup, we could find a recently
@@ -2528,9 +2533,9 @@ retry:
 	spin_lock(&ip->i_flags_lock);
 	if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
 		spin_unlock(&ip->i_flags_lock);
-		goto out_rcu_unlock;
+		rcu_read_unlock();
+		return;
 	}
-	spin_unlock(&ip->i_flags_lock);
 
 	/*
	 * Don't try to lock/unlock the current inode, but we _cannot_ skip the
@@ -2540,43 +2545,50 @@ retry:
 	 */
 	if (ip != free_ip) {
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+			spin_unlock(&ip->i_flags_lock);
 			rcu_read_unlock();
 			delay(1);
 			goto retry;
 		}
-
-		/*
-		 * Check the inode number again in case we're racing with
-		 * freeing in xfs_reclaim_inode(). See the comments in that
-		 * function for more information as to why the initial check is
-		 * not sufficient.
-		 */
-		if (ip->i_ino != inum) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			goto out_rcu_unlock;
-		}
 	}
+	ip->i_flags |= XFS_ISTALE;
+	spin_unlock(&ip->i_flags_lock);
 	rcu_read_unlock();
 
-	xfs_iflock(ip);
-	xfs_iflags_set(ip, XFS_ISTALE);
+	/*
+	 * If we can't get the flush lock, the inode is already attached. All
+	 * we needed to do here is mark the inode stale so buffer IO completion
+	 * will remove it from the AIL.
+	 */
+	iip = ip->i_itemp;
+	if (!xfs_iflock_nowait(ip)) {
+		ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+		ASSERT(iip->ili_last_fields);
+		goto out_iunlock;
+	}
 
 	/*
-	 * We don't need to attach clean inodes or those only with unlogged
-	 * changes (which we throw away, anyway).
+	 * Inodes not attached to the buffer can be released immediately.
+	 * Everything else has to go through xfs_iflush_abort() on journal
+	 * commit as the flock synchronises removal of the inode from the
+	 * cluster buffer against inode reclaim.
 	 */
-	if (!ip->i_itemp || xfs_inode_clean(ip)) {
-		ASSERT(ip != free_ip);
+	if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
 		xfs_ifunlock(ip);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		goto out_no_inode;
+		goto out_iunlock;
 	}
-	return ip;
 
-out_rcu_unlock:
-	rcu_read_unlock();
-out_no_inode:
-	return NULL;
+	/* we have a dirty inode in memory that has not yet been flushed. */
+	spin_lock(&iip->ili_lock);
+	iip->ili_last_fields = iip->ili_fields;
+	iip->ili_fields = 0;
+	iip->ili_fsync_fields = 0;
+	spin_unlock(&iip->ili_lock);
+	ASSERT(iip->ili_last_fields);
+
+out_iunlock:
+	if (ip != free_ip)
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 }
 
 /*
@@ -2586,26 +2598,20 @@ out_no_inode:
  */
 STATIC int
 xfs_ifree_cluster(
-	xfs_inode_t		*free_ip,
-	xfs_trans_t		*tp,
+	struct xfs_inode	*free_ip,
+	struct xfs_trans	*tp,
 	struct xfs_icluster	*xic)
 {
-	xfs_mount_t		*mp = free_ip->i_mount;
+	struct xfs_mount	*mp = free_ip->i_mount;
+	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
+	struct xfs_buf		*bp;
+	xfs_daddr_t		blkno;
+	xfs_ino_t		inum = xic->first_ino;
 	int			nbufs;
 	int			i, j;
 	int			ioffset;
-	xfs_daddr_t		blkno;
-	xfs_buf_t		*bp;
-	xfs_inode_t		*ip;
-	struct xfs_inode_log_item *iip;
-	struct xfs_log_item	*lip;
-	struct xfs_perag	*pag;
-	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	xfs_ino_t		inum;
 	int			error;
 
-	inum = xic->first_ino;
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
 
 	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2634,10 +2640,8 @@ xfs_ifree_cluster(
 		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
 				mp->m_bsize * igeo->blocks_per_cluster,
 				XBF_UNMAPPED, &bp);
-		if (error) {
-			xfs_perag_put(pag);
+		if (error)
 			return error;
-		}
 
 		/*
 		 * This buffer may not have been correctly initialised as we
@@ -2651,60 +2655,16 @@ xfs_ifree_cluster(
 		bp->b_ops = &xfs_inode_buf_ops;
 
 		/*
-		 * Walk the inodes already attached to the buffer and mark them
-		 * stale. These will all have the flush locks held, so an
-		 * in-memory inode walk can't lock them. By marking them all
-		 * stale first, we will not attempt to lock them in the loop
-		 * below as the XFS_ISTALE flag will be set.
+		 * Now we need to set all the cached clean inodes as XFS_ISTALE,
+		 * too. This requires lookups, and will skip inodes that we've
+		 * already marked XFS_ISTALE.
 		 */
-		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
-			if (lip->li_type == XFS_LI_INODE) {
-				iip = (struct xfs_inode_log_item *)lip;
-				ASSERT(iip->ili_logged == 1);
-				lip->li_cb = xfs_istale_done;
-				xfs_trans_ail_copy_lsn(mp->m_ail,
-							&iip->ili_flush_lsn,
-							&iip->ili_item.li_lsn);
-				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-			}
-		}
-
-
-		/*
-		 * For each inode in memory attempt to add it to the inode
-		 * buffer and set it up for being staled on buffer IO
-		 * completion. This is safe as we've locked out tail pushing
-		 * and flushing by locking the buffer.
-		 *
-		 * We have already marked every inode that was part of a
-		 * transaction stale above, which means there is no point in
-		 * even trying to lock them.
-		 */
-		for (i = 0; i < igeo->inodes_per_cluster; i++) {
-			ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
-			if (!ip)
-				continue;
-
-			iip = ip->i_itemp;
-			iip->ili_last_fields = iip->ili_fields;
-			iip->ili_fields = 0;
-			iip->ili_fsync_fields = 0;
-			iip->ili_logged = 1;
-			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-						&iip->ili_item.li_lsn);
-
-			xfs_buf_attach_iodone(bp, xfs_istale_done,
-						  &iip->ili_item);
-
-			if (ip != free_ip)
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
+		for (i = 0; i < igeo->inodes_per_cluster; i++)
+			xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
 
 		xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
-
-	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -2725,6 +2685,7 @@ xfs_ifree(
 {
 	int			error;
 	struct xfs_icluster	xic = { 0 };
+	struct xfs_inode_log_item *iip = ip->i_itemp;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(VFS_I(ip)->i_nlink == 0);
@@ -2762,7 +2723,9 @@ xfs_ifree(
 	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
 
 	/* Don't attempt to replay owner changes for a deleted inode */
-	ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+	spin_lock(&iip->ili_lock);
+	iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+	spin_unlock(&iip->ili_lock);
 
 	/*
 	 * Bump the generation count so no one will be confused
@@ -3469,232 +3432,9 @@ out_release_wip:
 	return error;
 }
 
-STATIC int
-xfs_iflush_cluster(
-	struct xfs_inode	*ip,
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_perag	*pag;
-	unsigned long		first_index, mask;
-	int			cilist_size;
-	struct xfs_inode	**cilist;
-	struct xfs_inode	*cip;
-	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	int			error = 0;
-	int			nr_found;
-	int			clcount = 0;
-	int			i;
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
-	cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
-	cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
-	if (!cilist)
-		goto out_put;
-
-	mask = ~(igeo->inodes_per_cluster - 1);
-	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-	rcu_read_lock();
-	/* really need a gang lookup range call here */
-	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
-					first_index, igeo->inodes_per_cluster);
-	if (nr_found == 0)
-		goto out_free;
-
-	for (i = 0; i < nr_found; i++) {
-		cip = cilist[i];
-		if (cip == ip)
-			continue;
-
-		/*
-		 * because this is an RCU protected lookup, we could find a
-		 * recently freed or even reallocated inode during the lookup.
-		 * We need to check under the i_flags_lock for a valid inode
-		 * here. Skip it if it is not valid or the wrong inode.
-		 */
-		spin_lock(&cip->i_flags_lock);
-		if (!cip->i_ino ||
-		    __xfs_iflags_test(cip, XFS_ISTALE)) {
-			spin_unlock(&cip->i_flags_lock);
-			continue;
-		}
-
-		/*
-		 * Once we fall off the end of the cluster, no point checking
-		 * any more inodes in the list because they will also all be
-		 * outside the cluster.
-		 */
-		if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
-			spin_unlock(&cip->i_flags_lock);
-			break;
-		}
-		spin_unlock(&cip->i_flags_lock);
-
-		/*
-		 * Do an un-protected check to see if the inode is dirty and
-		 * is a candidate for flushing. These checks will be repeated
-		 * later after the appropriate locks are acquired.
-		 */
-		if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
-			continue;
-
-		/*
-		 * Try to get locks. If any are unavailable or it is pinned,
-		 * then this inode cannot be flushed and is skipped.
-		 */
-
-		if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
-			continue;
-		if (!xfs_iflock_nowait(cip)) {
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
-			continue;
-		}
-		if (xfs_ipincount(cip)) {
-			xfs_ifunlock(cip);
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
-			continue;
-		}
-
-
-		/*
-		 * Check the inode number again, just to be certain we are not
-		 * racing with freeing in xfs_reclaim_inode(). See the comments
-		 * in that function for more information as to why the initial
-		 * check is not sufficient.
-		 */
-		if (!cip->i_ino) {
-			xfs_ifunlock(cip);
-			xfs_iunlock(cip, XFS_ILOCK_SHARED);
-			continue;
-		}
-
-		/*
-		 * arriving here means that this inode can be flushed. First
-		 * re-check that it's dirty before flushing.
-		 */
-		if (!xfs_inode_clean(cip)) {
-			error = xfs_iflush_int(cip, bp);
-			if (error) {
-				xfs_iunlock(cip, XFS_ILOCK_SHARED);
-				goto out_free;
-			}
-			clcount++;
-		} else {
-			xfs_ifunlock(cip);
-		}
-		xfs_iunlock(cip, XFS_ILOCK_SHARED);
-	}
-
-	if (clcount) {
-		XFS_STATS_INC(mp, xs_icluster_flushcnt);
-		XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
-	}
-
-out_free:
-	rcu_read_unlock();
-	kmem_free(cilist);
-out_put:
-	xfs_perag_put(pag);
-	return error;
-}
-
-/*
- * Flush dirty inode metadata into the backing buffer.
- *
- * The caller must have the inode lock and the inode flush lock held. The
- * inode lock will still be held upon return to the caller, and the inode
- * flush lock will be released after the inode has reached the disk.
- *
- * The caller must write out the buffer returned in *bpp and release it.
- */
-int
+static int
 xfs_iflush(
 	struct xfs_inode	*ip,
-	struct xfs_buf		**bpp)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_buf		*bp = NULL;
-	struct xfs_dinode	*dip;
-	int			error;
-
-	XFS_STATS_INC(mp, xs_iflush_count);
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(xfs_isiflocked(ip));
-	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
-	       ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
-	*bpp = NULL;
-
-	xfs_iunpin_wait(ip);
-
-	/*
-	 * For stale inodes we cannot rely on the backing buffer remaining
-	 * stale in cache for the remaining life of the stale inode and so
-	 * xfs_imap_to_bp() below may give us a buffer that no longer contains
-	 * inodes below. We have to check this after ensuring the inode is
-	 * unpinned so that it is safe to reclaim the stale inode after the
-	 * flush call.
-	 */
-	if (xfs_iflags_test(ip, XFS_ISTALE)) {
-		xfs_ifunlock(ip);
-		return 0;
-	}
-
-	/*
-	 * Get the buffer containing the on-disk inode. We are doing a try-lock
-	 * operation here, so we may get an EAGAIN error. In that case, return
-	 * leaving the inode dirty.
-	 *
-	 * If we get any other error, we effectively have a corruption situation
-	 * and we cannot flush the inode. Abort the flush and shut down.
-	 */
-	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK);
-	if (error == -EAGAIN) {
-		xfs_ifunlock(ip);
-		return error;
-	}
-	if (error)
-		goto abort;
-
-	/*
-	 * If the buffer is pinned then push on the log now so we won't
-	 * get stuck waiting in the write for too long.
-	 */
-	if (xfs_buf_ispinned(bp))
-		xfs_log_force(mp, 0);
-
-	/*
-	 * Flush the provided inode then attempt to gather others from the
-	 * cluster into the write.
-	 *
-	 * Note: Once we attempt to flush an inode, we must run buffer
-	 * completion callbacks on any failure. If this fails, simulate an I/O
-	 * failure on the buffer and shut down.
-	 */
-	error = xfs_iflush_int(ip, bp);
-	if (!error)
-		error = xfs_iflush_cluster(ip, bp);
-	if (error) {
-		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_ioend_fail(bp);
-		goto shutdown;
-	}
-
-	*bpp = bp;
-	return 0;
-
-abort:
-	xfs_iflush_abort(ip);
-shutdown:
-	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-	return error;
-}
-
-STATIC int
-xfs_iflush_int(
-	struct xfs_inode	*ip,
 	struct xfs_buf		*bp)
 {
 	struct xfs_inode_log_item *iip = ip->i_itemp;
@@ -3706,7 +3446,7 @@ xfs_iflush_int(
 	ASSERT(xfs_isiflocked(ip));
 	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-	ASSERT(iip != NULL && iip->ili_fields != 0);
+	ASSERT(iip->ili_item.li_buf == bp);
 
 	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
@@ -3801,7 +3541,6 @@ xfs_iflush_int(
 	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
 	if (XFS_IFORK_Q(ip))
 		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
-	xfs_inobp_check(mp, bp);
 
 	/*
 	 * We've recorded everything logged in the inode, so we'd like to clear
@@ -3818,40 +3557,148 @@ xfs_iflush_int(
 	 * know that the information those bits represent is permanently on
 	 * disk. As long as the flush completes before the inode is logged
 	 * again, then both ili_fields and ili_last_fields will be cleared.
-	 *
-	 * We can play with the ili_fields bits here, because the inode lock
-	 * must be held exclusively in order to set bits there and the flush
-	 * lock protects the ili_last_fields bits. Set ili_logged so the flush
-	 * done routine can tell whether or not to look in the AIL. Also, store
-	 * the current LSN of the inode so that we can tell whether the item has
-	 * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
-	 * need the AIL lock, because it is a 64 bit value that cannot be read
-	 * atomically.
 	 */
 	error = 0;
flush_out:
+	spin_lock(&iip->ili_lock);
 	iip->ili_last_fields = iip->ili_fields;
 	iip->ili_fields = 0;
 	iip->ili_fsync_fields = 0;
-	iip->ili_logged = 1;
+	spin_unlock(&iip->ili_lock);
 
+	/*
+	 * Store the current LSN of the inode so that we can tell whether the
+	 * item has moved in the AIL from xfs_iflush_done().
+	 */
 	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
 				&iip->ili_item.li_lsn);
 
+	/* generate the checksum. */
+	xfs_dinode_calc_crc(mp, dip);
+	return error;
+}
+
+/*
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
+ *
+ * The caller must have a reference to the inode and hold the cluster buffer
+ * locked. The function will walk across all the inodes on the cluster buffer it
+ * can find and lock without blocking, and flush them to the cluster buffer.
+ *
+ * On successful flushing of at least one inode, the caller must write out the
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
+ * the caller needs to release the buffer. On failure, the filesystem will be
+ * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
+ * will be returned.
+ */
+int
+xfs_iflush_cluster(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_log_item	*lip, *n;
+	struct xfs_inode	*ip;
+	struct xfs_inode_log_item *iip;
+	int			clcount = 0;
+	int			error = 0;
+
 	/*
-	 * Attach the inode item callback to the buffer whether the flush
-	 * succeeded or not. If not, the caller will shut down and fail I/O
-	 * completion on the buffer to remove the inode from the AIL and release
-	 * the flush lock.
+	 * We must use the safe variant here as on shutdown xfs_iflush_abort()
+	 * can remove itself from the list.
 	 */
-	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+		iip = (struct xfs_inode_log_item *)lip;
+		ip = iip->ili_inode;
 
-	/* generate the checksum. */
-	xfs_dinode_calc_crc(mp, dip);
+		/*
+		 * Quick and dirty check to avoid locks if possible.
+		 */
+		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
+			continue;
+		if (xfs_ipincount(ip))
+			continue;
+
+		/*
+		 * The inode is still attached to the buffer, which means it is
+		 * dirty but reclaim might try to grab it. Check carefully for
+		 * that, and grab the ilock while still holding the i_flags_lock
+		 * to guarantee reclaim will not be able to reclaim this inode
+		 * once we drop the i_flags_lock.
+		 */
+		spin_lock(&ip->i_flags_lock);
+		ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+		if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
+		}
+
+		/*
+		 * ILOCK will pin the inode against reclaim and prevent
+		 * concurrent transactions modifying the inode while we are
+		 * flushing the inode.
+		 */
+		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
+
+		/*
+		 * Skip inodes that are already flush locked as they have
+		 * already been written to the buffer.
+		 */
+		if (!xfs_iflock_nowait(ip)) {
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			continue;
+		}
+
+		/*
+		 * Abort flushing this inode if we are shut down because the
+		 * inode may not currently be in the AIL. This can occur when
+		 * log I/O failure unpins the inode without inserting into the
+		 * AIL, leaving a dirty/unpinned inode attached to the buffer
+		 * that otherwise looks like it should be flushed.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_iunpin_wait(ip);
+			/* xfs_iflush_abort() drops the flush lock */
+			xfs_iflush_abort(ip);
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			error = -EIO;
+			continue;
+		}
+
+		/* don't block waiting on a log force to unpin dirty inodes */
+		if (xfs_ipincount(ip)) {
+			xfs_ifunlock(ip);
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			continue;
+		}
+
+		if (!xfs_inode_clean(ip))
+			error = xfs_iflush(ip, bp);
+		else
+			xfs_ifunlock(ip);
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		if (error)
+			break;
+		clcount++;
+	}
+
+	if (error) {
+		bp->b_flags |= XBF_ASYNC;
+		xfs_buf_ioend_fail(bp);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		return error;
+	}
+
+	if (!clcount)
+		return -EAGAIN;
+
+	XFS_STATS_INC(mp, xs_icluster_flushcnt);
+	XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
 	return 0;
-
-	ASSERT(!list_empty(&bp->b_li_list));
-	ASSERT(bp->b_iodone != NULL);
-	return error;
 }
 
 /* Release an inode. */
@@ -3881,3 +3728,96 @@ xfs_log_force_inode(
 		return 0;
 	return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
 }
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+	struct inode		*src,
+	struct inode		*dest)
+{
+	int			error;
+
+	if (src > dest)
+		swap(src, dest);
+
+retry:
+	/* Wait to break both inodes' layouts before we start locking. */
+	error = break_layout(src, true);
+	if (error)
+		return error;
+	if (src != dest) {
+		error = break_layout(dest, true);
+		if (error)
+			return error;
+	}
+
+	/* Lock one inode and make sure nobody got in and leased it. */
+	inode_lock(src);
+	error = break_layout(src, false);
+	if (error) {
+		inode_unlock(src);
+		if (error == -EWOULDBLOCK)
+			goto retry;
+		return error;
+	}
+
+	if (src == dest)
+		return 0;
+
+	/* Lock the other inode and make sure nobody got in and leased it. */
+	inode_lock_nested(dest, I_MUTEX_NONDIR2);
+	error = break_layout(dest, false);
+	if (error) {
+		inode_unlock(src);
+		inode_unlock(dest);
+		if (error == -EWOULDBLOCK)
+			goto retry;
+		return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ */
+int
+xfs_ilock2_io_mmap(
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	int			ret;
+
+	ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+	if (ret)
+		return ret;
+	if (ip1 == ip2)
+		xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+	else
+		xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
+				    ip2, XFS_MMAPLOCK_EXCL);
+	return 0;
+}
+
+/* Unlock both inodes to allow IO and mmap activity. */
+void
+xfs_iunlock2_io_mmap(
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*ip2)
+{
+	bool			same_inode = (ip1 == ip2);
+
+	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+	if (!same_inode)
+		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+	inode_unlock(VFS_I(ip2));
+	if (!same_inode)
+		inode_unlock(VFS_I(ip1));
+}
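
The new xfs_iolock_two_inodes_and_break_layout() helper at the end of the diff avoids ABBA deadlocks by always taking the inode with the lower pointer value first, and by taking the lock only once when both arguments are the same inode. The userspace sketch below illustrates only that ordering rule under assumed names (struct obj, lock_two_objs, unlock_two_objs are invented for illustration, not kernel API); the lease-breaking retry loop is omitted.

/* Minimal sketch of address-ordered two-object locking, assuming pthreads. */
#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t	lock;
	int		value;
};

static void lock_two_objs(struct obj *a, struct obj *b)
{
	if (a > b) {			/* lowest address is always locked first */
		struct obj *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)			/* same object: take the lock only once */
		pthread_mutex_lock(&b->lock);
}

static void unlock_two_objs(struct obj *a, struct obj *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct obj x = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct obj y = { PTHREAD_MUTEX_INITIALIZER, 2 };

	/* Both argument orders acquire the locks in the same address order. */
	lock_two_objs(&x, &y);
	x.value += y.value;
	unlock_two_objs(&x, &y);

	lock_two_objs(&y, &x);
	printf("x=%d y=%d\n", x.value, y.value);
	unlock_two_objs(&y, &x);
	return 0;
}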
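The reworked xfs_iflush_cluster() follows a non-blocking pattern: walk the items attached to the buffer, try-lock each one, skip anything busy rather than sleeping, and report -EAGAIN if nothing was flushed. The sketch below shows the same pattern with pthread_mutex_trylock(); the item structure and flush_ready_items() name are hypothetical and stand in for the buffer's log item list.

/* Trylock-and-skip iteration sketch; compile with -pthread. */
#include <pthread.h>
#include <stdio.h>

struct item {
	pthread_mutex_t	lock;
	int		dirty;
};

/* Returns how many items were flushed; 0 means "nothing done, retry later". */
static int flush_ready_items(struct item *items, int nr)
{
	int count = 0;

	for (int i = 0; i < nr; i++) {
		if (pthread_mutex_trylock(&items[i].lock) != 0)
			continue;	/* busy: skip instead of blocking */
		if (items[i].dirty) {
			items[i].dirty = 0;
			count++;
		}
		pthread_mutex_unlock(&items[i].lock);
	}
	return count;
}

int main(void)
{
	struct item items[4] = {
		{ PTHREAD_MUTEX_INITIALIZER, 1 },
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
		{ PTHREAD_MUTEX_INITIALIZER, 1 },
		{ PTHREAD_MUTEX_INITIALIZER, 1 },
	};

	printf("flushed %d items\n", flush_ready_items(items, 4));
	return 0;
}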