summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 11:39:44 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 11:39:44 -0700
commit2aae1d67fd1d9070f8f23a6e7d9a7a093cf35fbb (patch)
tree425d848ecbbd12856fe7ef7190b8497d47ed4c7e /fs
parentb8fc1bd73a5a12e48f9fd2e7ccea60cadf718c93 (diff)
parentdc99c0ff53f588bb210b1e8b3314c7581cde68a2 (diff)
Merge tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs inode / dentry updates from Christian Brauner: "This contains smaller performance improvements to inodes and dentries: inode: - Add rcu based inode lookup variants. They avoid one inode hash lock acquire in the common case thereby significantly reducing contention. We already support RCU-based operations but didn't take advantage of them during inode insertion. Callers of iget_locked() get the improvement without any code changes. Callers that need a custom callback can switch to iget5_locked_rcu() as e.g., did btrfs. With 20 threads each walking a dedicated 1000 dirs * 1000 files directory tree to stat(2) on a 32 core + 24GB ram vm: before: 3.54s user 892.30s system 1966% cpu 45.549 total after: 3.28s user 738.66s system 1955% cpu 37.932 total (-16.7%) Long-term we should pick up the effort to introduce more fine-grained locking and possibly improve on the currently used hash implementation. - Start zeroing i_state in inode_init_always() instead of doing it in individual filesystems. This allows us to remove an unneeded lock acquire in new_inode() and not burden individual filesystems with this. dcache: - Move d_lockref out of the area used by RCU lookup to avoid cacheline ping poing because the embedded name is sharing a cacheline with d_lockref. - Fix dentry size on 32bit with CONFIG_SMP=y so it does actually end up with 128 bytes in total" * tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: fs: fix dentry size vfs: move d_lockref out of the area used by RCU lookup bcachefs: remove now spurious i_state initialization xfs: remove now spurious i_state initialization in xfs_inode_alloc vfs: partially sanitize i_state zeroing on inode creation xfs: preserve i_state around inode_init_always in xfs_reinit_inode btrfs: use iget5_locked_rcu vfs: add rcu-based find_inode variants for iget ops
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/fs.c1
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/inode.c108
-rw-r--r--fs/xfs/xfs_icache.c5
4 files changed, 86 insertions, 30 deletions
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index fa1fee05cf8f..0c7d1bc0548a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -244,7 +244,6 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
inode->ei_flags = 0;
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- inode->v.i_state = 0;
if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
kmem_cache_free(bch2_inode_cache, inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3a2b902b2d1f..d62c96f00ff8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
diff --git a/fs/inode.c b/fs/inode.c
index e0815acc5abb..f356fe2ec2b6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -162,6 +162,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
+ inode->i_state = 0;
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
@@ -231,6 +232,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (unlikely(security_inode_alloc(inode)))
return -ENOMEM;
+
this_cpu_inc(nr_inodes);
return 0;
@@ -886,36 +888,45 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
return freed;
}
-static void __wait_on_freeing_inode(struct inode *inode);
+static void __wait_on_freeing_inode(struct inode *inode, bool locked);
/*
* Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
- void *data)
+ void *data, bool locked)
{
struct inode *inode = NULL;
+ if (locked)
+ lockdep_assert_held(&inode_hash_lock);
+ else
+ lockdep_assert_not_held(&inode_hash_lock);
+
+ rcu_read_lock();
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_sb != sb)
continue;
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return inode;
}
+ rcu_read_unlock();
return NULL;
}
@@ -924,29 +935,39 @@ repeat:
* iget_locked for details.
*/
static struct inode *find_inode_fast(struct super_block *sb,
- struct hlist_head *head, unsigned long ino)
+ struct hlist_head *head, unsigned long ino,
+ bool locked)
{
struct inode *inode = NULL;
+ if (locked)
+ lockdep_assert_held(&inode_hash_lock);
+ else
+ lockdep_assert_not_held(&inode_hash_lock);
+
+ rcu_read_lock();
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_ino != ino)
continue;
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return inode;
}
+ rcu_read_unlock();
return NULL;
}
@@ -1004,14 +1025,7 @@ EXPORT_SYMBOL(get_next_ino);
*/
struct inode *new_inode_pseudo(struct super_block *sb)
{
- struct inode *inode = alloc_inode(sb);
-
- if (inode) {
- spin_lock(&inode->i_lock);
- inode->i_state = 0;
- spin_unlock(&inode->i_lock);
- }
- return inode;
+ return alloc_inode(sb);
}
/**
@@ -1161,7 +1175,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
again:
spin_lock(&inode_hash_lock);
- old = find_inode(inode->i_sb, head, test, data);
+ old = find_inode(inode->i_sb, head, test, data, true);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
@@ -1235,7 +1249,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
struct inode *new = alloc_inode(sb);
if (new) {
- new->i_state = 0;
inode = inode_insert5(new, hashval, test, set, data);
if (unlikely(inode != new))
destroy_inode(new);
@@ -1246,6 +1259,47 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
EXPORT_SYMBOL(iget5_locked);
/**
+ * iget5_locked_rcu - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is equivalent to iget5_locked, except the @test callback must
+ * tolerate the inode not being stable, including being mid-teardown.
+ */
+struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode, *new;
+
+again:
+ inode = find_inode(sb, head, test, data, false);
+ if (inode) {
+ if (IS_ERR(inode))
+ return NULL;
+ wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
+ return inode;
+ }
+
+ new = alloc_inode(sb);
+ if (new) {
+ inode = inode_insert5(new, hashval, test, set, data);
+ if (unlikely(inode != new))
+ destroy_inode(new);
+ }
+ return inode;
+}
+EXPORT_SYMBOL_GPL(iget5_locked_rcu);
+
+/**
* iget_locked - obtain an inode from a mounted file system
* @sb: super block of file system
* @ino: inode number to get
@@ -1263,9 +1317,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
again:
- spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
- spin_unlock(&inode_hash_lock);
+ inode = find_inode_fast(sb, head, ino, false);
if (inode) {
if (IS_ERR(inode))
return NULL;
@@ -1283,7 +1335,7 @@ again:
spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
- old = find_inode_fast(sb, head, ino);
+ old = find_inode_fast(sb, head, ino, true);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
@@ -1419,7 +1471,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
struct inode *inode;
spin_lock(&inode_hash_lock);
- inode = find_inode(sb, head, test, data);
+ inode = find_inode(sb, head, test, data, true);
spin_unlock(&inode_hash_lock);
return IS_ERR(inode) ? NULL : inode;
@@ -1474,7 +1526,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
+ inode = find_inode_fast(sb, head, ino, true);
spin_unlock(&inode_hash_lock);
if (inode) {
@@ -2235,17 +2287,21 @@ EXPORT_SYMBOL(inode_needs_sync);
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
* will DTRT.
*/
-static void __wait_on_freeing_inode(struct inode *inode)
+static void __wait_on_freeing_inode(struct inode *inode, bool locked)
{
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
+ rcu_read_unlock();
+ if (locked)
+ spin_unlock(&inode_hash_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
- spin_lock(&inode_hash_lock);
+ if (locked)
+ spin_lock(&inode_hash_lock);
+ rcu_read_lock();
}
static __initdata unsigned long ihash_entries;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9967334ea99f..cf629302d48e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -86,9 +86,8 @@ xfs_inode_alloc(
return NULL;
}
- /* VFS doesn't initialise i_mode or i_state! */
+ /* VFS doesn't initialise i_mode! */
VFS_I(ip)->i_mode = 0;
- VFS_I(ip)->i_state = 0;
mapping_set_large_folios(VFS_I(ip)->i_mapping);
XFS_STATS_INC(mp, vn_active);
@@ -314,6 +313,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
+ unsigned long state = inode->i_state;
error = inode_init_always(mp->m_super, inode);
@@ -324,6 +324,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
+ inode->i_state = state;
mapping_set_large_folios(inode->i_mapping);
return error;
}