summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- fs/namespace.c 268
-rw-r--r-- include/linux/mount.h 21
2 files changed, 106 insertions, 183 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685f..22ae06ad751 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
- atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+ mnt->mnt_writers = alloc_percpu(int);
+ if (!mnt->mnt_writers)
+ goto out_free_devname;
+#else
+ mnt->mnt_writers = 0;
+#endif
}
return mnt;
+#ifdef CONFIG_SMP
+out_free_devname:
+ kfree(mnt->mnt_devname);
+#endif
out_free_id:
mnt_free_id(mnt);
out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-struct mnt_writer {
- /*
- * If holding multiple instances of this lock, they
- * must be ordered by cpu number.
- */
- spinlock_t lock;
- struct lock_class_key lock_class; /* compiles out with !lockdep */
- unsigned long count;
- struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+ mnt->mnt_writers++;
+#endif
+}
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
{
- int cpu;
- for_each_possible_cpu(cpu) {
- struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
- spin_lock_init(&writer->lock);
- lockdep_set_class(&writer->lock, &writer->lock_class);
- writer->count = 0;
- }
- return 0;
+#ifdef CONFIG_SMP
+ (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+ mnt->mnt_writers--;
+#endif
}
-fs_initcall(init_mnt_writers);
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
{
+#ifdef CONFIG_SMP
+ unsigned int count = 0;
int cpu;
- struct mnt_writer *cpu_writer;
for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_unlock(&cpu_writer->lock);
+ count += *per_cpu_ptr(mnt->mnt_writers, cpu);
}
-}
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
- if (!cpu_writer->mnt)
- return;
- /*
- * This is in case anyone ever leaves an invalid,
- * old ->mnt and a count of 0.
- */
- if (!cpu_writer->count)
- return;
- atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
- cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
- struct vfsmount *mnt)
-{
- if (cpu_writer->mnt == mnt)
- return;
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = mnt;
+ return count;
+#else
+ return mnt->mnt_writers;
+#endif
}
/*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
int mnt_want_write(struct vfsmount *mnt)
{
int ret = 0;
- struct mnt_writer *cpu_writer;
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
+ preempt_disable();
+ inc_mnt_writers(mnt);
+ /*
+ * The store done by inc_mnt_writers() must be visible before we enter
+ * the MNT_WRITE_HOLD loop below, so that the slowpath can see our
+ * incremented count after it has set MNT_WRITE_HOLD.
+ */
+ smp_mb();
+ while (mnt->mnt_flags & MNT_WRITE_HOLD)
+ cpu_relax();
+ /*
+ * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+ * be set to match its requirements. So we must not load that until
+ * MNT_WRITE_HOLD is cleared.
+ */
+ smp_rmb();
if (__mnt_is_readonly(mnt)) {
+ dec_mnt_writers(mnt);
ret = -EROFS;
goto out;
}
- use_cpu_writer_for_mount(cpu_writer, mnt);
- cpu_writer->count++;
out:
- spin_unlock(&cpu_writer->lock);
- put_cpu_var(mnt_writers);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);
-static void lock_mnt_writers(void)
-{
- int cpu;
- struct mnt_writer *cpu_writer;
-
- for_each_possible_cpu(cpu) {
- cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- __clear_mnt_count(cpu_writer);
- cpu_writer->mnt = NULL;
- }
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
- if (atomic_read(&mnt->__mnt_writers) >=
- MNT_WRITER_UNDERFLOW_LIMIT)
- return;
- /*
- * It isn't necessary to hold all of the locks
- * at the same time, but doing it this way makes
- * us share a lot more code.
- */
- lock_mnt_writers();
- /*
- * vfsmount_lock is for mnt_flags.
- */
- spin_lock(&vfsmount_lock);
- /*
- * If coalescing the per-cpu writer counts did not
- * get us back to a positive writer count, we have
- * a bug.
- */
- if ((atomic_read(&mnt->__mnt_writers) < 0) &&
- !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
- WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
- "count: %d\n",
- mnt, atomic_read(&mnt->__mnt_writers));
- /* use the flag to keep the dmesg spam down */
- mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
- }
- spin_unlock(&vfsmount_lock);
- unlock_mnt_writers();
-}
-
/**
* mnt_drop_write - give up write access to a mount
* @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
*/
void mnt_drop_write(struct vfsmount *mnt)
{
- int must_check_underflow = 0;
- struct mnt_writer *cpu_writer;
-
- cpu_writer = &get_cpu_var(mnt_writers);
- spin_lock(&cpu_writer->lock);
-
- use_cpu_writer_for_mount(cpu_writer, mnt);
- if (cpu_writer->count > 0) {
- cpu_writer->count--;
- } else {
- must_check_underflow = 1;
- atomic_dec(&mnt->__mnt_writers);
- }
-
- spin_unlock(&cpu_writer->lock);
- /*
- * Logically, we could call this each time,
- * but the __mnt_writers cacheline tends to
- * be cold, and makes this expensive.
- */
- if (must_check_underflow)
- handle_write_count_underflow(mnt);
- /*
- * This could be done right after the spinlock
- * is taken because the spinlock keeps us on
- * the cpu, and disables preemption. However,
- * putting it here bounds the amount that
- * __mnt_writers can underflow. Without it,
- * we could theoretically wrap __mnt_writers.
- */
- put_cpu_var(mnt_writers);
+ preempt_disable();
+ dec_mnt_writers(mnt);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
{
int ret = 0;
- lock_mnt_writers();
+ spin_lock(&vfsmount_lock);
+ mnt->mnt_flags |= MNT_WRITE_HOLD;
/*
- * With all the locks held, this value is stable
+ * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+ * should be visible before we do.
*/
- if (atomic_read(&mnt->__mnt_writers) > 0) {
- ret = -EBUSY;
- goto out;
- }
+ smp_mb();
+
/*
- * nobody can do a successful mnt_want_write() with all
- * of the counts in MNT_DENIED_WRITE and the locks held.
+ * With writers on hold, if this value is zero, then there are
+ * definitely no active writers (although held writers may subsequently
+ * increment the count, they'll have to wait, and decrement it after
+ * seeing MNT_READONLY).
+ *
+ * It is OK to have a counter incremented on one CPU and decremented on
+ * another: the sum will add up correctly. The danger arises while we
+ * sum the counters: if we read one CPU's counter before it is
+ * incremented, but then read another CPU's counter after the matching
+ * decrement landed there, we would see more decrements than we should.
+ * MNT_WRITE_HOLD protects against this scenario, because
+ * mnt_want_write first increments count, then smp_mb, then spins on
+ * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+ * we're counting up here.
*/
- spin_lock(&vfsmount_lock);
- if (!ret)
+ if (count_mnt_writers(mnt) > 0)
+ ret = -EBUSY;
+ else
mnt->mnt_flags |= MNT_READONLY;
+ /*
+ * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+ * that become unheld will see MNT_READONLY.
+ */
+ smp_wmb();
+ mnt->mnt_flags &= ~MNT_WRITE_HOLD;
spin_unlock(&vfsmount_lock);
-out:
- unlock_mnt_writers();
return ret;
}
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
{
kfree(mnt->mnt_devname);
mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+ free_percpu(mnt->mnt_writers);
+#endif
kmem_cache_free(mnt_cache, mnt);
}
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
static inline void __mntput(struct vfsmount *mnt)
{
- int cpu;
struct super_block *sb = mnt->mnt_sb;
/*
- * We don't have to hold all of the locks at the
- * same time here because we know that we're the
- * last reference to mnt and that no new writers
- * can come in.
- */
- for_each_possible_cpu(cpu) {
- struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
- spin_lock(&cpu_writer->lock);
- if (cpu_writer->mnt != mnt) {
- spin_unlock(&cpu_writer->lock);
- continue;
- }
- atomic_add(cpu_writer->count, &mnt->__mnt_writers);
- cpu_writer->count = 0;
- /*
- * Might as well do this so that no one
- * ever sees the pointer and expects
- * it to be valid.
- */
- cpu_writer->mnt = NULL;
- spin_unlock(&cpu_writer->lock);
- }
- /*
* This probably indicates that somebody messed
* up a mnt_want/drop_write() pair. If this
* happens, the filesystem was probably unable
* to make r/w->r/o transitions.
*/
- WARN_ON(atomic_read(&mnt->__mnt_writers));
+ /*
+ * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+ * provides barriers, so count_mnt_writers() below is safe. AV
+ */
+ WARN_ON(count_mnt_writers(mnt));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903af..ac49c1f8e5c 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
#define MNT_STRICTATIME 0x80
#define MNT_SHRINKABLE 0x100
-#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD 0x200
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
int mnt_ghosts;
- /*
- * This value is not stable unless all of the mnt_writers[] spinlocks
- * are held, and all mnt_writer[]s on this mount have 0 as their ->count
- */
- atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+ int *mnt_writers;
+#else
+ int mnt_writers;
+#endif
};
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+ return mnt->mnt_writers;
+#else
+ return &mnt->mnt_writers;
+#endif
+}
+
static inline struct vfsmount *mntget(struct vfsmount *mnt)
{
if (mnt)