author		Dave Chinner <dchinner@redhat.com>	2024-06-20 09:21:28 +0200
committer	Chandan Babu R <chandanbabu@kernel.org>	2024-07-04 12:46:47 +0530
commit		f3f7ae68a4ea23aa9c49530733f1faaa6996b03a (patch)
tree		a99ca7ad8a05b3d304a5755cf14cb15d96321af9 /fs/xfs
parent		c1220522ef405a9ebf19447330c9e9de5dfc649c (diff)
xfs: skip flushing log items during push
The AIL pushing code spends a huge amount of time skipping over items that are already marked as flushing. It is not uncommon to see hundreds of thousands of items skipped every second due to inode clustering marking all the inodes in a cluster as flushing when the first one is flushed.

However, to discover that an item is already flushing and should be skipped, we have to call its iop_push() method to try to flush the item. For inodes (where this matters most), we first have to check that the inode is flushable.

We can optimise this overhead away by tracking whether the log item is flushing internally. This allows xfsaild_push() to check the log item directly for flushing state and immediately skip the log item. Whilst this doesn't remove the CPU cache misses for loading the log item, it does avoid the overhead of an indirect function call and the cache misses involved in accessing inode and backing cluster buffer structures to determine flushing state.

When trying to flush hundreds of thousands of inodes each second, this CPU overhead saving adds up quickly. It's so noticeable that the biggest issue with pushing on the AIL on fast storage becomes the 10ms back-off wait when we hit enough pinned buffers to break out of the push loop but not enough for the AIL pushing to be considered stuck. This limits the xfsaild to about 70% total CPU usage, and on fast storage this isn't enough to keep the storage 100% busy.

The xfsaild will block on IO submission on slow storage and so is self-throttling - it does not need a backoff in the case where we are really just breaking out of the walk to submit the IO we have gathered.

Further, with no backoff we don't need to gather huge delwri lists to mitigate the impact of backoffs, so we can submit IO more frequently and reduce the time log items spend in flushing state by breaking out of the item push loop once we've gathered enough IO to batch submission effectively.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
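The core of the change is cheap enough to show in a few lines. Below is a minimal user-space sketch of the idea, not the kernel code itself: struct log_item, try_push() and fake_push() are hypothetical stand-ins. Testing a flag bit stored in the log item replaces an indirect iop_push() call whose only purpose was often to report "already flushing":

	#include <stdbool.h>
	#include <stdio.h>

	/* Bit number mirrors XFS_LI_FLUSHING in the patch below. */
	#define LI_FLUSHING	5

	struct log_item {
		unsigned long	li_flags;
		/* stand-in for the kernel's indirect iop_push() method */
		int		(*iop_push)(struct log_item *lip);
	};

	/*
	 * Old behaviour: every item, flushing or not, paid for the
	 * indirect call (and the inode/cluster-buffer lookups behind
	 * it). New behaviour: one flag test on the log item skips it.
	 */
	static bool try_push(struct log_item *lip)
	{
		if (lip->li_flags & (1UL << LI_FLUSHING))
			return false;			/* fast skip */
		return lip->iop_push(lip) == 0;		/* slow path */
	}

	static int fake_push(struct log_item *lip)
	{
		(void)lip;
		return 0;	/* pretend the flush was started */
	}

	int main(void)
	{
		struct log_item busy = { 1UL << LI_FLUSHING, fake_push };
		struct log_item idle = { 0, fake_push };

		printf("busy pushed: %d\n", try_push(&busy));	/* 0: skipped */
		printf("idle pushed: %d\n", try_push(&idle));	/* 1: flushed */
		return 0;
	}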
Diffstat (limited to 'fs/xfs')
-rw-r--r--	fs/xfs/xfs_inode.c	1
-rw-r--r--	fs/xfs/xfs_inode_item.c	6
-rw-r--r--	fs/xfs/xfs_trans.h	4
-rw-r--r--	fs/xfs/xfs_trans_ail.c	8
4 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 62ca6c75117c..7dc6f326936c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2555,6 +2555,7 @@ flush_out:
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
+ set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
spin_unlock(&iip->ili_lock);
/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index ef05cbbe116c..b509cbd191f4 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -965,6 +965,7 @@ xfs_iflush_finish(
}
iip->ili_last_fields = 0;
iip->ili_flush_lsn = 0;
+ clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
spin_unlock(&iip->ili_lock);
xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
if (drop_buffer)
@@ -1023,8 +1024,10 @@ xfs_buf_inode_io_fail(
{
struct xfs_log_item *lip;
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
set_bit(XFS_LI_FAILED, &lip->li_flags);
+ clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
+ }
}
/*
@@ -1043,6 +1046,7 @@ xfs_iflush_abort_clean(
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
+ clear_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
}
/*
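Taken together, the xfs_inode.c hunk and the three hunks above define the flag's lifecycle: set when the inode is handed to the cluster buffer for write-out, cleared on flush completion, buffer IO failure, and flush abort. A hedged stdatomic sketch of that pairing (user-space stand-ins, not the kernel's set_bit()/clear_bit() bitops):

	#include <stdatomic.h>

	#define LI_FLUSHING	5

	/* One atomic flags word standing in for xfs_log_item.li_flags. */
	static atomic_ulong li_flags;

	/* Set once the flush is handed to the backing cluster buffer... */
	static void flush_start(void)
	{
		atomic_fetch_or(&li_flags, 1UL << LI_FLUSHING);
	}

	/*
	 * ...and cleared on every exit path: completion, IO failure,
	 * abort. The invariant is that a set bit always means "flush in
	 * progress", so xfsaild_push() can trust it without touching the
	 * inode or its locks.
	 */
	static void flush_done(void)
	{
		atomic_fetch_and(&li_flags, ~(1UL << LI_FLUSHING));
	}

	int main(void)
	{
		flush_start();
		flush_done();
		return 0;
	}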
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f97e8c68641f..f06cc0f41665 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -58,13 +58,15 @@ struct xfs_log_item {
#define XFS_LI_FAILED 2
#define XFS_LI_DIRTY 3
#define XFS_LI_WHITEOUT 4
+#define XFS_LI_FLUSHING 5
#define XFS_LI_FLAGS \
{ (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
{ (1u << XFS_LI_ABORTED), "ABORTED" }, \
{ (1u << XFS_LI_FAILED), "FAILED" }, \
{ (1u << XFS_LI_DIRTY), "DIRTY" }, \
- { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }, \
+ { (1u << XFS_LI_FLUSHING), "FLUSHING" }
struct xfs_item_ops {
unsigned flags;
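The XFS_LI_FLAGS table added to above pairs each flag bit with a printable name; in the kernel this kind of { mask, "name" } table is consumed by trace formatting (e.g. __print_flags()) so li_flags shows up symbolically in tracepoints. A small user-space sketch of the same pattern (print_li_flags() is hypothetical; bit numbers follow the header above, with IN_AIL and ABORTED as bits 0 and 1):

	#include <stdio.h>

	/* Flag bits and names as in the updated XFS_LI_FLAGS table. */
	static const struct { unsigned mask; const char *name; } li_flag_names[] = {
		{ 1u << 0, "IN_AIL" },
		{ 1u << 1, "ABORTED" },
		{ 1u << 2, "FAILED" },
		{ 1u << 3, "DIRTY" },
		{ 1u << 4, "WHITEOUT" },
		{ 1u << 5, "FLUSHING" },
	};

	/* Render a li_flags value as "FLAG|FLAG|...". */
	static void print_li_flags(unsigned flags)
	{
		const char *sep = "";
		unsigned i;

		for (i = 0; i < sizeof(li_flag_names) / sizeof(li_flag_names[0]); i++) {
			if (flags & li_flag_names[i].mask) {
				printf("%s%s", sep, li_flag_names[i].name);
				sep = "|";
			}
		}
		putchar('\n');
	}

	int main(void)
	{
		print_li_flags((1u << 0) | (1u << 5));	/* IN_AIL|FLUSHING */
		return 0;
	}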
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 6a106a05fae0..0fafcc9f3dbe 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -512,6 +512,9 @@ xfsaild_push(
while ((XFS_LSN_CMP(lip->li_lsn, ailp->ail_target) <= 0)) {
int lock_result;
+ if (test_bit(XFS_LI_FLUSHING, &lip->li_flags))
+ goto next_item;
+
/*
* Note that iop_push may unlock and reacquire the AIL lock. We
* rely on the AIL cursor implementation to be able to deal with
@@ -581,9 +584,12 @@ xfsaild_push(
if (stuck > 100)
break;
+next_item:
lip = xfs_trans_ail_cursor_next(ailp, &cur);
if (lip == NULL)
break;
+ if (lip->li_lsn != lsn && count > 1000)
+ break;
lsn = lip->li_lsn;
}
@@ -620,7 +626,7 @@ out_done:
/*
* Assume we have more work to do in a short while.
*/
- tout = 10;
+ tout = 0;
}
return tout;
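The final hunk is the back-off change the commit message describes: xfsaild_push() returns a timeout in milliseconds that the xfsaild daemon sleeps for before pushing again, and returning 0 instead of 10 means the next push starts immediately after breaking out to submit gathered IO. A hedged sketch of a consumer loop in that spirit (push_work() and aild_loop() are hypothetical stand-ins; the real sleep logic lives in xfsaild() in xfs_trans_ail.c):

	#include <stdio.h>
	#include <unistd.h>

	/*
	 * Stand-in for xfsaild_push(): returns a back-off time in
	 * milliseconds, 0 meaning "push again immediately", e.g. when we
	 * only broke out of the walk to submit the delwri IO we gathered.
	 */
	static long push_work(void)
	{
		return 0;
	}

	/*
	 * With the old "tout = 10" the thread idled 10ms per loop even on
	 * fast storage, capping it around 70% CPU; with 0 it re-enters the
	 * next push straight away.
	 */
	static void aild_loop(int iterations)
	{
		while (iterations--) {
			long tout = push_work();

			if (tout)
				usleep((useconds_t)tout * 1000);
		}
	}

	int main(void)
	{
		aild_loop(3);
		puts("done");
		return 0;
	}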