summaryrefslogtreecommitdiff
path: root/drivers/md/dm-raid.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm-raid.c')
-rw-r--r--drivers/md/dm-raid.c398
1 files changed, 270 insertions, 128 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6d53810963f7..f8564d63982f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -24,6 +24,11 @@
*/
#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
+/*
+ * Minimum journal space 4 MiB in sectors.
+ */
+#define MIN_RAID456_JOURNAL_SPACE (4*2048)
+
static bool devices_handle_discard_safely = false;
/*
@@ -73,6 +78,9 @@ struct raid_dev {
#define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
+/* New for v1.10.0 */
+#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */
+
/*
* Flags for rs->ctr_flags field.
*/
@@ -91,6 +99,9 @@ struct raid_dev {
#define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS)
#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
+#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
+
+#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
/*
* Definitions of various constructor flags to
@@ -160,22 +171,22 @@ struct raid_dev {
CTR_FLAG_DAEMON_SLEEP | \
CTR_FLAG_MIN_RECOVERY_RATE | \
CTR_FLAG_MAX_RECOVERY_RATE | \
- CTR_FLAG_MAX_WRITE_BEHIND | \
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV)
#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
CTR_FLAG_REBUILD | \
CTR_FLAG_DAEMON_SLEEP | \
CTR_FLAG_MIN_RECOVERY_RATE | \
CTR_FLAG_MAX_RECOVERY_RATE | \
- CTR_FLAG_MAX_WRITE_BEHIND | \
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV)
/* ...valid options definitions per raid level */
/*
@@ -224,6 +235,12 @@ struct raid_set {
struct raid_type *raid_type;
struct dm_target_callbacks callbacks;
+ /* Optional raid4/5/6 journal device */
+ struct journal_dev {
+ struct dm_dev *dev;
+ struct md_rdev rdev;
+ } journal_dev;
+
struct raid_dev dev[0];
};
@@ -308,6 +325,7 @@ static struct arg_name_flag {
{ CTR_FLAG_DATA_OFFSET, "data_offset"},
{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
+ { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
};
/* Return argument name string for given @flag */
@@ -372,7 +390,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
/* Return true, if raid set in @rs is recovering */
static bool rs_is_recovering(struct raid_set *rs)
{
- return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
+ return rs->md.recovery_cp < rs->md.dev_sectors;
}
/* Return true, if raid set in @rs is reshaping */
@@ -629,7 +647,8 @@ static void rs_set_capacity(struct raid_set *rs)
* is unintended in case of out-of-place reshaping
*/
rdev_for_each(rdev, mddev)
- rdev->sectors = mddev->dev_sectors;
+ if (!test_bit(Journal, &rdev->flags))
+ rdev->sectors = mddev->dev_sectors;
set_capacity(gendisk, mddev->array_sectors);
revalidate_disk(gendisk);
@@ -715,6 +734,11 @@ static void raid_set_free(struct raid_set *rs)
{
int i;
+ if (rs->journal_dev.dev) {
+ md_rdev_clear(&rs->journal_dev.rdev);
+ dm_put_device(rs->ti, rs->journal_dev.dev);
+ }
+
for (i = 0; i < rs->raid_disks; i++) {
if (rs->dev[i].meta_dev)
dm_put_device(rs->ti, rs->dev[i].meta_dev);
@@ -762,10 +786,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rs->dev[i].data_dev = NULL;
/*
- * There are no offsets, since there is a separate device
- * for data and metadata.
+ * There are no offsets initially.
+ * Out of place reshape will set them accordingly.
*/
rs->dev[i].rdev.data_offset = 0;
+ rs->dev[i].rdev.new_data_offset = 0;
rs->dev[i].rdev.mddev = &rs->md;
arg = dm_shift_arg(as);
@@ -823,6 +848,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rebuild++;
}
+ if (rs->journal_dev.dev)
+ list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
+
if (metadata_available) {
rs->md.external = 0;
rs->md.persistent = 1;
@@ -1028,6 +1056,8 @@ too_many:
* [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
* [stripe_cache <sectors>] Stripe cache size for higher RAIDs
* [region_size <sectors>] Defines granularity of bitmap
+ * [journal_dev <dev>] raid4/5/6 journaling deviice
+ * (i.e. write hole closing log)
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
@@ -1135,7 +1165,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
/*
* Parameters that take a string value are checked here.
*/
-
+ /* "raid10_format {near|offset|far} */
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
rs->ti->error = "Only one 'raid10_format' argument pair allowed";
@@ -1153,6 +1183,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
continue;
}
+ /* "journal_dev dev" */
+ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
+ int r;
+ struct md_rdev *jdev;
+
+ if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
+ return -EINVAL;
+ }
+ if (!rt_is_raid456(rt)) {
+ rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
+ return -EINVAL;
+ }
+ r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+ &rs->journal_dev.dev);
+ if (r) {
+ rs->ti->error = "raid4/5/6 journal device lookup failure";
+ return r;
+ }
+ jdev = &rs->journal_dev.rdev;
+ md_rdev_init(jdev);
+ jdev->mddev = &rs->md;
+ jdev->bdev = rs->journal_dev.dev->bdev;
+ jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
+ if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
+ rs->ti->error = "No space for raid4/5/6 journal";
+ return -ENOSPC;
+ }
+ set_bit(Journal, &jdev->flags);
+ continue;
+ }
+
+ /*
+ * Parameters with number values from here on.
+ */
if (kstrtoint(arg, 10, &value) < 0) {
rs->ti->error = "Bad numerical argument given in raid params";
return -EINVAL;
@@ -1427,6 +1492,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs)
return rs->raid_disks - rs->raid_type->parity_devs;
}
+/*
+ * Retrieve rdev->sectors from any valid raid device of @rs
+ * to allow userpace to pass in arbitray "- -" device tupples.
+ */
+static sector_t __rdev_sectors(struct raid_set *rs)
+{
+ int i;
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ struct md_rdev *rdev = &rs->dev[i].rdev;
+
+ if (!test_bit(Journal, &rdev->flags) &&
+ rdev->bdev && rdev->sectors)
+ return rdev->sectors;
+ }
+
+ BUG(); /* Constructor ensures we got some. */
+}
+
/* Calculate the sectors per device and per array used for @rs */
static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
{
@@ -1470,7 +1554,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
array_sectors = (data_stripes + delta_disks) * dev_sectors;
rdev_for_each(rdev, mddev)
- rdev->sectors = dev_sectors;
+ if (!test_bit(Journal, &rdev->flags))
+ rdev->sectors = dev_sectors;
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
@@ -1512,9 +1597,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
else if (dev_sectors == MaxSector)
/* Prevent recovery */
__rs_setup_recovery(rs, MaxSector);
- else if (rs->dev[0].rdev.sectors < dev_sectors)
+ else if (__rdev_sectors(rs) < dev_sectors)
/* Grown raid set */
- __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
+ __rs_setup_recovery(rs, __rdev_sectors(rs));
else
__rs_setup_recovery(rs, MaxSector);
}
@@ -1853,18 +1938,21 @@ static int rs_check_reshape(struct raid_set *rs)
return -EPERM;
}
-static int read_disk_sb(struct md_rdev *rdev, int size)
+static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
{
BUG_ON(!rdev->sb_page);
- if (rdev->sb_loaded)
+ if (rdev->sb_loaded && !force_reload)
return 0;
+ rdev->sb_loaded = 0;
+
if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
DMERR("Failed to read superblock of device at position %d",
rdev->raid_disk);
md_error(rdev->mddev, rdev);
- return -EINVAL;
+ set_bit(Faulty, &rdev->flags);
+ return -EIO;
}
rdev->sb_loaded = 1;
@@ -1992,7 +2080,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
return -EINVAL;
}
- r = read_disk_sb(rdev, rdev->sb_size);
+ r = read_disk_sb(rdev, rdev->sb_size, false);
if (r)
return r;
@@ -2011,7 +2099,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
/* Force writing of superblocks to disk */
- set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+ set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);
/* Any superblock is better than none, choose that if given */
return refdev ? 0 : 1;
@@ -2050,16 +2138,17 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
mddev->reshape_position = MaxSector;
+ mddev->raid_disks = le32_to_cpu(sb->num_devices);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
+
/*
* Reshaping is supported, e.g. reshape_position is valid
* in superblock and superblock content is authoritative.
*/
if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
/* Superblock is authoritative wrt given raid set layout! */
- mddev->raid_disks = le32_to_cpu(sb->num_devices);
- mddev->level = le32_to_cpu(sb->level);
- mddev->layout = le32_to_cpu(sb->layout);
- mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
mddev->new_level = le32_to_cpu(sb->new_level);
mddev->new_layout = le32_to_cpu(sb->new_layout);
mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors);
@@ -2087,38 +2176,44 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
/*
* No takeover/reshaping, because we don't have the extended v1.9.0 metadata
*/
- if (le32_to_cpu(sb->level) != mddev->new_level) {
- DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
- return -EINVAL;
- }
- if (le32_to_cpu(sb->layout) != mddev->new_layout) {
- DMERR("Reshaping raid sets not yet supported. (raid layout change)");
- DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
- DMERR(" Old layout: %s w/ %d copies",
- raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
- raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
- DMERR(" New layout: %s w/ %d copies",
- raid10_md_layout_to_format(mddev->layout),
- raid10_md_layout_to_copies(mddev->layout));
- return -EINVAL;
- }
- if (le32_to_cpu(sb->stripe_sectors) != mddev->new_chunk_sectors) {
- DMERR("Reshaping raid sets not yet supported. (stripe sectors change)");
- return -EINVAL;
- }
+ struct raid_type *rt_cur = get_raid_type_by_ll(mddev->level, mddev->layout);
+ struct raid_type *rt_new = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
- /* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */
- if (!rt_is_raid1(rs->raid_type) &&
- (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
- DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)",
- sb->num_devices, mddev->raid_disks);
+ if (rs_takeover_requested(rs)) {
+ if (rt_cur && rt_new)
+ DMERR("Takeover raid sets from %s to %s not yet supported by metadata. (raid level change)",
+ rt_cur->name, rt_new->name);
+ else
+ DMERR("Takeover raid sets not yet supported by metadata. (raid level change)");
+ return -EINVAL;
+ } else if (rs_reshape_requested(rs)) {
+ DMERR("Reshaping raid sets not yet supported by metadata. (raid layout change keeping level)");
+ if (mddev->layout != mddev->new_layout) {
+ if (rt_cur && rt_new)
+ DMERR(" current layout %s vs new layout %s",
+ rt_cur->name, rt_new->name);
+ else
+ DMERR(" current layout 0x%X vs new layout 0x%X",
+ le32_to_cpu(sb->layout), mddev->new_layout);
+ }
+ if (mddev->chunk_sectors != mddev->new_chunk_sectors)
+ DMERR(" current stripe sectors %u vs new stripe sectors %u",
+ mddev->chunk_sectors, mddev->new_chunk_sectors);
+ if (rs->delta_disks)
+ DMERR(" current %u disks vs new %u disks",
+ mddev->raid_disks, mddev->raid_disks + rs->delta_disks);
+ if (rs_is_raid10(rs)) {
+ DMERR(" Old layout: %s w/ %u copies",
+ raid10_md_layout_to_format(mddev->layout),
+ raid10_md_layout_to_copies(mddev->layout));
+ DMERR(" New layout: %s w/ %u copies",
+ raid10_md_layout_to_format(mddev->new_layout),
+ raid10_md_layout_to_copies(mddev->new_layout));
+ }
return -EINVAL;
}
DMINFO("Discovered old metadata format; upgrading to extended metadata format");
-
- /* Table line is checked vs. authoritative superblock */
- rs_set_new(rs);
}
if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
@@ -2141,6 +2236,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
d = 0;
rdev_for_each(r, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+
if (test_bit(FirstUse, &r->flags))
new_devs++;
@@ -2196,7 +2294,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
sb_retrieve_failed_devices(sb, failed_devices);
rdev_for_each(r, mddev) {
- if (!r->sb_page)
+ if (test_bit(Journal, &rdev->flags) ||
+ !r->sb_page)
continue;
sb2 = page_address(r->sb_page);
sb2->failed_devices = 0;
@@ -2211,7 +2310,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
continue;
if (role != r->raid_disk) {
- if (__is_raid10_near(mddev->layout)) {
+ if (rs_is_raid10(rs) && __is_raid10_near(mddev->layout)) {
if (mddev->raid_disks % __raid10_near_copies(mddev->layout) ||
rs->raid_disks % rs->raid10_copies) {
rs->ti->error =
@@ -2248,7 +2347,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
struct mddev *mddev = &rs->md;
struct dm_raid_superblock *sb;
- if (rs_is_raid0(rs) || !rdev->sb_page)
+ if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
return 0;
sb = page_address(rdev->sb_page);
@@ -2273,7 +2372,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
/* Enable bitmap creation for RAID levels != 0 */
mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
- rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+ mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
/* Retrieve device size stored in superblock to be prepared for shrink */
@@ -2311,21 +2410,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int r;
- struct raid_dev *dev;
- struct md_rdev *rdev, *tmp, *freshest;
+ struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
freshest = NULL;
- rdev_for_each_safe(rdev, tmp, mddev) {
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+
/*
* Skipping super_load due to CTR_FLAG_SYNC will cause
* the array to undergo initialization again as
* though it were new. This is the intended effect
* of the "sync" directive.
*
- * When reshaping capability is added, we must ensure
- * that the "sync" directive is disallowed during the
- * reshape.
+ * With reshaping capability added, we must ensure that
+ * that the "sync" directive is disallowed during the reshape.
*/
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
continue;
@@ -2342,6 +2442,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 0:
break;
default:
+ /* This is a failure to read the superblock from the metadata device. */
/*
* We have to keep any raid0 data/metadata device pairs or
* the MD raid0 personality will fail to start the array.
@@ -2349,33 +2450,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (rs_is_raid0(rs))
continue;
- dev = container_of(rdev, struct raid_dev, rdev);
- if (dev->meta_dev)
- dm_put_device(ti, dev->meta_dev);
-
- dev->meta_dev = NULL;
- rdev->meta_bdev = NULL;
-
- if (rdev->sb_page)
- put_page(rdev->sb_page);
-
- rdev->sb_page = NULL;
-
- rdev->sb_loaded = 0;
-
/*
- * We might be able to salvage the data device
- * even though the meta device has failed. For
- * now, we behave as though '- -' had been
- * set for this device in the table.
+ * We keep the dm_devs to be able to emit the device tuple
+ * properly on the table line in raid_status() (rather than
+ * mistakenly acting as if '- -' got passed into the constructor).
+ *
+ * The rdev has to stay on the same_set list to allow for
+ * the attempt to restore faulty devices on second resume.
*/
- if (dev->data_dev)
- dm_put_device(ti, dev->data_dev);
-
- dev->data_dev = NULL;
- rdev->bdev = NULL;
-
- list_del(&rdev->same_set);
+ rdev->raid_disk = rdev->saved_raid_disk = -1;
+ break;
}
}
@@ -2396,7 +2480,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
return -EINVAL;
rdev_for_each(rdev, mddev)
- if ((rdev != freshest) && super_validate(rs, rdev))
+ if (!test_bit(Journal, &rdev->flags) &&
+ rdev != freshest &&
+ super_validate(rs, rdev))
return -EINVAL;
return 0;
}
@@ -2483,10 +2569,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
return -ENOSPC;
}
out:
- /* Adjust data offsets on all rdevs */
+ /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
rdev_for_each(rdev, &rs->md) {
- rdev->data_offset = data_offset;
- rdev->new_data_offset = new_data_offset;
+ if (!test_bit(Journal, &rdev->flags)) {
+ rdev->data_offset = data_offset;
+ rdev->new_data_offset = new_data_offset;
+ }
}
return 0;
@@ -2499,8 +2587,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs)
struct md_rdev *rdev;
rdev_for_each(rdev, &rs->md) {
- rdev->raid_disk = i++;
- rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+ if (!test_bit(Journal, &rdev->flags)) {
+ rdev->raid_disk = i++;
+ rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+ }
}
}
@@ -2840,7 +2930,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- calculated_dev_sectors = rs->dev[0].rdev.sectors;
+ calculated_dev_sectors = rs->md.dev_sectors;
/*
* Backup any new raid set level, layout, ...
@@ -2853,7 +2943,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
+ resize = calculated_dev_sectors != __rdev_sectors(rs);
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
@@ -2897,6 +2987,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ /* We can't takeover a journaled raid4/5/6 */
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ ti->error = "Can't takeover a journaled raid4/5/6 set";
+ r = -EPERM;
+ goto bad;
+ }
+
/*
* If a takeover is needed, userspace sets any additional
* devices to rebuild and we can check for a valid request here.
@@ -2919,6 +3016,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs_set_new(rs);
} else if (rs_reshape_requested(rs)) {
/*
+ * No need to check for 'ongoing' takeover here, because takeover
+ * is an instant operation as oposed to an ongoing reshape.
+ */
+
+ /* We can't reshape a journaled raid4/5/6 */
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ ti->error = "Can't reshape a journaled raid4/5/6 set";
+ r = -EPERM;
+ goto bad;
+ }
+
+ /*
* We can only prepare for a reshape here, because the
* raid set needs to run to provide the repective reshape
* check functions via its MD personality instance.
@@ -2994,6 +3103,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
}
+ /* Disable/enable discard support on raid set. */
+ configure_discard_support(rs);
+
mddev_unlock(&rs->md);
return 0;
@@ -3063,18 +3175,23 @@ static const char *decipher_sync_action(struct mddev *mddev)
}
/*
- * Return status string @rdev
+ * Return status string for @rdev
*
* Status characters:
*
- * 'D' = Dead/Failed device
+ * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
* 'a' = Alive but not in-sync
- * 'A' = Alive and in-sync
+ * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
+ * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
*/
static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
{
- if (test_bit(Faulty, &rdev->flags))
+ if (!rdev->bdev)
+ return "-";
+ else if (test_bit(Faulty, &rdev->flags))
return "D";
+ else if (test_bit(Journal, &rdev->flags))
+ return "A";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
return "a";
else
@@ -3143,7 +3260,8 @@ static sector_t rs_get_progress(struct raid_set *rs,
* being initialized.
*/
rdev_for_each(rdev, mddev)
- if (!test_bit(In_sync, &rdev->flags))
+ if (!test_bit(Journal, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags))
*array_in_sync = true;
#if 0
r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
@@ -3175,7 +3293,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
sector_t progress, resync_max_sectors, resync_mismatches;
const char *sync_action;
struct raid_type *rt;
- struct md_rdev *rdev;
switch (type) {
case STATUSTYPE_INFO:
@@ -3196,9 +3313,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
atomic64_read(&mddev->resync_mismatches) : 0;
sync_action = decipher_sync_action(&rs->md);
- /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
- rdev_for_each(rdev, mddev)
- DMEMIT(__raid_dev_status(rdev, array_in_sync));
+ /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
+ for (i = 0; i < rs->raid_disks; i++)
+ DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
/*
* In-sync/Reshape ratio:
@@ -3244,6 +3361,12 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* so retrieving it from the first raid disk is sufficient.
*/
DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
+
+ /*
+ * v1.10.0+:
+ */
+ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
+ __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
break;
case STATUSTYPE_TABLE:
@@ -3257,7 +3380,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
raid_param_cnt += rebuild_disks * 2 +
write_mostly_params +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
- hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
+ hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
+ (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
/* Emit table line */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
@@ -3304,6 +3428,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
+ __get_dev_name(rs->journal_dev.dev));
DMEMIT(" %d", rs->raid_disks);
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3337,12 +3464,15 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
else if (!strcasecmp(argv[0], "recover"))
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
else {
- if (!strcasecmp(argv[0], "check"))
+ if (!strcasecmp(argv[0], "check")) {
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- else if (!!strcasecmp(argv[0], "repair"))
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ } else if (!strcasecmp(argv[0], "repair")) {
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ } else
return -EINVAL;
- set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
if (mddev->ro == 2) {
/* A write to sync_action is enough to justify
@@ -3419,11 +3549,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
- for (i = 0; i < rs->md.raid_disks; i++) {
+ for (i = 0; i < mddev->raid_disks; i++) {
r = &rs->dev[i].rdev;
- if (test_bit(Faulty, &r->flags) && r->sb_page &&
- sync_page_io(r, 0, r->sb_size, r->sb_page,
- REQ_OP_READ, 0, true)) {
+ /* HM FIXME: enhance journal device recovery processing */
+ if (test_bit(Journal, &r->flags))
+ continue;
+
+ if (test_bit(Faulty, &r->flags) &&
+ r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) {
DMINFO("Faulty %s device #%d has readable super block."
" Attempting to revive it.",
rs->raid_type->name, i);
@@ -3437,22 +3570,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
* '>= 0' - meaning we must call this function
* ourselves.
*/
- if ((r->raid_disk >= 0) &&
- (mddev->pers->hot_remove_disk(mddev, r) != 0))
- /* Failed to revive this device, try next */
- continue;
-
- r->raid_disk = i;
- r->saved_raid_disk = i;
flags = r->flags;
+ clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
+ if (r->raid_disk >= 0) {
+ if (mddev->pers->hot_remove_disk(mddev, r)) {
+ /* Failed to revive this device, try next */
+ r->flags = flags;
+ continue;
+ }
+ } else
+ r->raid_disk = r->saved_raid_disk = i;
+
clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags);
- clear_bit(In_sync, &r->flags);
+
if (mddev->pers->hot_add_disk(mddev, r)) {
- r->raid_disk = -1;
- r->saved_raid_disk = -1;
+ /* Failed to revive this device, try next */
+ r->raid_disk = r->saved_raid_disk = -1;
r->flags = flags;
} else {
+ clear_bit(In_sync, &r->flags);
r->recovery_offset = 0;
set_bit(i, (void *) cleared_failed_devices);
cleared = true;
@@ -3465,6 +3602,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
uint64_t failed_devices[DISKS_ARRAY_ELEMS];
rdev_for_each(r, &rs->md) {
+ if (test_bit(Journal, &r->flags))
+ continue;
+
sb = page_address(r->sb_page);
sb_retrieve_failed_devices(sb, failed_devices);
@@ -3497,7 +3637,7 @@ static void rs_update_sbs(struct raid_set *rs)
struct mddev *mddev = &rs->md;
int ro = mddev->ro;
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
mddev->ro = 0;
md_update_sb(mddev, 1);
mddev->ro = ro;
@@ -3580,12 +3720,6 @@ static int raid_preresume(struct dm_target *ti)
if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags))
rs_update_sbs(rs);
- /*
- * Disable/enable discard support on raid set after any
- * conversion, because devices can have been added
- */
- configure_discard_support(rs);
-
/* Load the bitmap from disk unless raid0 */
r = __load_dirty_region_bitmap(rs);
if (r)
@@ -3641,7 +3775,15 @@ static void raid_resume(struct dm_target *ti)
mddev->ro = 0;
mddev->in_sync = 0;
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ /*
+ * Keep the RAID set frozen if reshape/rebuild flags are set.
+ * The RAID set is unfrozen once the next table load/resume,
+ * which clears the reshape/rebuild flags, occurs.
+ * This ensures that the constructor for the inactive table
+ * retrieves an up-to-date reshape_position.
+ */
+ if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->suspended)
mddev_resume(mddev);
@@ -3649,7 +3791,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 9, 1},
+ .version = {1, 10, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,