xfs: log all dirty inodes in xfs_fs_sync_fs

Commit be4f1ac828776bbc7868a68b465cd8eedb733cfd upstream. Since Linux 2.6.36 the writeback code has introduces various measures for live lock prevention during sync(). Unfortunately some of these are actively harmful for the XFS model, where the inode gets marked dirty for metadata from the data I/O handler. The older_than_this checks that are now more strictly enforced since writeback: avoid livelocking WB_SYNC_ALL writeback by only calling into __writeback_inodes_sb and thus only sampling the current cut off time once. But on a slow enough devices the previous asynchronous sync pass might not have fully completed yet, and thus XFS might mark metadata dirty only after that sampling of the cut off time for the blocking pass already happened. I have not myself reproduced this myself on a real system, but by introducing artificial delay into the XFS I/O completion workqueues it can be reproduced easily. Fix this by iterating over all XFS inodes in ->sync_fs and log all that are dirty. This might log inode that only got redirtied after the previous pass, but given how cheap delayed logging of inodes is it isn't a major concern for performance. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Tested-by: Mark Tinguely <tinguely@sgi.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
author: Christoph Hellwig <hch@infradead.org> 2012-01-04 09:48:36 -0500
committer: Greg Kroah-Hartman <gregkh@suse.de> 2012-01-06 14:14:12 -0800
commit: 6826d3e80d143ca7411fd2dca05bc57c7ed3e620 (patch)
tree: b0cddba0874ca2f9e49739c70a11de2d06435242 /fs
parent: b32a7304bea14c51fe279281db1ff02eefad2e3a (diff)
2 files changed, 39 insertions, 0 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b69688d0776..2f277a04d67 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -336,6 +336,32 @@ xfs_sync_fsdata(
 	return xfs_bwrite(mp, bp);
 }
 
+int
+xfs_log_dirty_inode(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (!ip->i_update_core)
+		return 0;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return xfs_trans_commit(tp, 0);
+}
+
 /*
  * When remounting a filesystem read-only or freezing the filesystem, we have
  * two phases to execute. This first phase is syncing the data before we
@@ -365,6 +391,17 @@ xfs_quiesce_data(
 
 	/* push and block till complete */
 	xfs_sync_data(mp, SYNC_WAIT);
+
+	/*
+	 * Log all pending size and timestamp updates.  The vfs writeback
+	 * code is supposed to do this, but due to its overagressive
+	 * livelock detection it will skip inodes where appending writes
+	 * were written out in the first non-blocking sync phase if their
+	 * completion took long enough that it happened after taking the
+	 * timestamp for the cut-off in the blocking phase.
+	 */
+	xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
+
 	xfs_qm_sync(mp, SYNC_WAIT);
 
 	/* write superblock and hoover up shutdown errors */
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index e3a6ad27415..ef5b2ce4298 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -42,6 +42,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
 
 void xfs_flush_inodes(struct xfs_inode *ip);
 
+int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
+
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
author	Christoph Hellwig <hch@infradead.org>	2012-01-04 09:48:36 -0500
committer	Greg Kroah-Hartman <gregkh@suse.de>	2012-01-06 14:14:12 -0800
commit	6826d3e80d143ca7411fd2dca05bc57c7ed3e620 (patch)
tree	b0cddba0874ca2f9e49739c70a11de2d06435242 /fs
parent	b32a7304bea14c51fe279281db1ff02eefad2e3a (diff)