From 40cf2123c57928c3ec0626c49bef97ebdbce008e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:52 +1000 Subject: md/multipath: add rcu protection to rdev access in multipath_status. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/multipath.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/md/multipath.c') diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index dd483bb2e111..69244de2036b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -141,17 +141,19 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) return; } -static void multipath_status (struct seq_file *seq, struct mddev *mddev) +static void multipath_status(struct seq_file *seq, struct mddev *mddev) { struct mpconf *conf = mddev->private; int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) - seq_printf (seq, "%s", - conf->multipaths[i].rdev && - test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); seq_printf (seq, "]"); } -- cgit v1.2.3 From f5b67ae86ee317db20c0e10d54f16a0bbbd3207d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:53 +1000 Subject: md: be extra careful not to take a reference to a Faulty device. It is important that we never increment rdev->nr_pending on a Faulty device as ->hot_remove_disk() assumes that once the Faulty flag is visible no code will take a new reference. Some places take a new reference after only check In_sync. This should be safe as the two are changed together. However to make the code more obviously safe, add checks for 'Faulty' as well. Note: the actual rule is: Never increment nr_pending if Faulty is set and Blocked is clear, never clear Faulty, and never set Blocked without holding a reference through nr_pending. fix build error (Shaohua) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/multipath.c | 3 ++- drivers/md/raid10.c | 6 ++++++ drivers/md/raid5.c | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/md/multipath.c') diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 69244de2036b..7eb9972a37e6 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -43,7 +43,8 @@ static int multipath_map (struct mpconf *conf) rcu_read_lock(); for (i = 0; i < disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) { + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); return i; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 526c1d82246e..34facda18e72 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2287,6 +2287,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); @@ -2339,6 +2340,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2378,6 +2380,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (!rdev || + test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; @@ -2953,6 +2956,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mreplace = rcu_dereference(mirror->replacement); if ((mrdev == NULL || + test_bit(Faulty, &mrdev->flags) || test_bit(In_sync, &mrdev->flags)) && (mreplace == NULL || test_bit(Faulty, &mreplace->flags))) { @@ -2971,6 +2975,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c07b22e8d946..f6a191aaaa91 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3080,7 +3080,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct md_rdev *rdev; rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) atomic_inc(&rdev->nr_pending); else rdev = NULL; -- cgit v1.2.3 From d787be4092e27728cb4c012bee9762098ef3c662 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:53 +1000 Subject: md: reduce the number of synchronize_rcu() calls when multiple devices fail. Every time a device is removed with ->hot_remove_disk() a synchronize_rcu() call is made which can delay several milliseconds in some case. If lots of devices fail at once - as could happen with a large RAID10 where one set of devices are removed all at once - these delays can add up to be very inconcenient. As failure is not reversible we can check for that first, setting a separate flag if it is found, and then all synchronize_rcu() once for all the flagged devices. Then ->hot_remove_disk() function can skip the synchronize_rcu() step if the flag is set. fix build error(Shaohua) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 29 ++++++++++++++++++++++++++--- drivers/md/md.h | 5 +++++ drivers/md/multipath.c | 14 ++++++++------ drivers/md/raid1.c | 17 ++++++++++------- drivers/md/raid10.c | 19 +++++++++++-------- drivers/md/raid5.c | 15 +++++++++------ 6 files changed, 69 insertions(+), 30 deletions(-) (limited to 'drivers/md/multipath.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0793754eeffd..2ed547f5c3b6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8180,15 +8180,34 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; + bool remove_some = false; - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + /* Faulty non-Blocked devices with nr_pending == 0 + * never get nr_pending incremented, + * never get Faulty cleared, and never get Blocked set. + * So we can synchronize_rcu now rather than once per device + */ + remove_some = true; + set_bit(RemoveSynchronized, &rdev->flags); + } + } + + if (remove_some) + synchronize_rcu(); + rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && - (test_bit(Faulty, &rdev->flags) || + ((test_bit(RemoveSynchronized, &rdev->flags) || (!test_bit(In_sync, &rdev->flags) && !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0) { + atomic_read(&rdev->nr_pending)==0)) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -8196,6 +8215,10 @@ static int remove_and_add_spares(struct mddev *mddev, removed++; } } + if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) + clear_bit(RemoveSynchronized, &rdev->flags); + } + if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); diff --git a/drivers/md/md.h b/drivers/md/md.h index 03b19aad4921..dc65ca65b26e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -163,6 +163,11 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, + RemoveSynchronized, /* synchronize_rcu() was called after + * this device was known to be faulty, + * so it is safe to remove without + * another synchronize_rcu() call. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7eb9972a37e6..c145a5a114eb 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -298,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } } err = md_integrity_register(mddev); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34f20c03d1f6..5027ef4752ac 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1656,13 +1656,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } else if (conf->mirrors[conf->raid_disks + number].rdev) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 34facda18e72..8ee5d96e6a2d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1766,7 +1766,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - /* Only remove faulty devices if recovery + /* Only remove non-faulty devices if recovery * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && @@ -1778,13 +1778,16 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f6a191aaaa91..413cc7d847da 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7197,12 +7197,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); -- cgit v1.2.3