From 2c810cddc44d6f95cef75df3f07fc0850ff92417 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 21 May 2012 09:27:00 +1000 Subject: md: allow a reshape operation to be reversed. Currently a reshape operation always progresses from the start of the array to the end unless the number of devices is being reduced, in which case it progresses in the opposite direction. To reverse a partial reshape which changes the number of devices you can stop the array and re-assemble with the raid-disks numbers reversed and the reshape will be undone. However for a reshape that does not change the number of devices it is not possible to reverse the reshape in the middle - you have to wait until it completes. So add a 'reshape_direction' attribute which is either 'forwards' or 'backwards' and can be explicitly set when delta_disks is zero. This will become more important when we allow the data_offset to change in a reshape. Then the explicit statement of what direction is being used will be more useful. This can be enabled in raid5 trivially as it already supports reverse reshape and just needs to use a different trigger to request it. Signed-off-by: NeilBrown --- include/linux/raid/md_p.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 8c0a3adc5df5..07e05f92d050 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -281,10 +281,15 @@ struct mdp_superblock_1 { * active device with same 'role'. * 'recovery_offset' is also set. */ +#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number * of devices, but is going * backwards anyway. */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ |MD_FEATURE_BAD_BLOCKS \ - |MD_FEATURE_REPLACEMENT) + |MD_FEATURE_REPLACEMENT \ + |MD_FEATURE_RESHAPE_BACKWARDS) #endif -- cgit v1.2.3 From c6563a8c38fde3c1c7fc925a10bde3ca20799301 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 21 May 2012 09:27:00 +1000 Subject: md: add possibility to change data-offset for devices. When reshaping we can avoid a costly intermediate backup by changing the 'start' address of the array on the device (if there is enough room). So as a first step, allow such a change to be requested through sysfs, and recorded in v1.x metadata. (As we didn't previously check that all 'pad' fields were zero, we need a new FEATURE flag for this. We (belatedly) check that all remaining 'pad' fields are zero to avoid a repeat of this.) The new data offset must be requested separately for each device. This allows each to have a different change in the data offset. This is not likely to be used often but as data_offset can be set per-device, new_data_offset should be too. This patch also removes the 'acknowledged' arg to rdev_set_badblocks as it is never used and never will be. At the same time we add a new arg ('is_new') which is currently always zero but will be used more soon. When a reshape finishes we will need to update the data_offset and rdev->sectors. So provide an exported function to do that. 
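As a rough illustration only (not part of this patch; the sysfs paths, the md0/dev-sda names and the offset value below are assumptions), a userspace tool such as mdadm could drive the new per-device attribute roughly as in the sketch that follows, declaring the reshape direction first because an increased offset is only accepted once the reshape is marked 'backwards'.

/* Hypothetical userspace sketch; paths and values are illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_sysfs(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Moving data to a larger offset is a backwards reshape, so the
	 * direction must be declared before new_offset is accepted. */
	if (write_sysfs("/sys/block/md0/md/reshape_direction", "backwards"))
		perror("reshape_direction");

	/* Each member device gets its own new data offset (in sectors). */
	if (write_sysfs("/sys/block/md0/md/dev-sda/new_offset", "4096"))
		perror("new_offset");

	return 0;
}
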
Signed-off-by: NeilBrown --- drivers/md/md.c | 217 +++++++++++++++++++++++++++++++++++++++++----- drivers/md/md.h | 7 +- drivers/md/raid1.c | 4 +- drivers/md/raid10.c | 8 +- drivers/md/raid5.c | 10 ++- include/linux/raid/md_p.h | 10 ++- 6 files changed, 222 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/md/md.c b/drivers/md/md.c index 44bb1d52dd4c..9fa98fc74b05 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1035,12 +1035,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) struct super_type { char *name; struct module *owner; - int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, + int (*load_super)(struct md_rdev *rdev, + struct md_rdev *refdev, int minor_version); - int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); - void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); + int (*validate_super)(struct mddev *mddev, + struct md_rdev *rdev); + void (*sync_super)(struct mddev *mddev, + struct md_rdev *rdev); unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); + int (*allow_new_offset)(struct md_rdev *rdev, + unsigned long long new_offset); }; /* @@ -1112,6 +1117,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; rdev->badblocks.shift = -1; @@ -1438,6 +1444,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return num_sectors; } +static int +super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) +{ + /* non-zero offset changes not possible with v0.90 */ + return new_offset == 0; +} /* * version 1 superblock @@ -1473,6 +1485,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ struct mdp_superblock_1 *sb; int ret; sector_t sb_start; + sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; @@ -1527,9 +1540,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ bdevname(rdev->bdev,b)); return -EINVAL; } + if (sb->pad0 || + sb->pad3[0] || + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) + /* Some padding is non-zero, might be a new feature */ + return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->new_data_offset = rdev->data_offset; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; @@ -1540,6 +1562,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; + if (minor_version + && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) rdev->desc_nr = -1; @@ -1611,16 +1636,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ else ret = 0; } - if (minor_version) - rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - - le64_to_cpu(sb->data_offset); - else - rdev->sectors = rdev->sb_start; - if (rdev->sectors < le64_to_cpu(sb->data_size)) + if (minor_version) { + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); + sectors -= rdev->data_offset; + } else + sectors = 
rdev->sb_start; + if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; rdev->sectors = le64_to_cpu(sb->data_size); - if (le64_to_cpu(sb->size) > rdev->sectors) - return -EINVAL; return ret; } @@ -1745,7 +1768,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); - memset(sb->pad1, 0, sizeof(sb->pad1)); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1767,6 +1789,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->devflags |= WriteMostly1; else sb->devflags &= ~WriteMostly1; + sb->data_offset = cpu_to_le64(rdev->data_offset); + sb->data_size = cpu_to_le64(rdev->sectors); if (mddev->bitmap && mddev->bitmap_info.file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); @@ -1795,6 +1819,12 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->reshape_backwards) sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + if (rdev->new_data_offset != rdev->data_offset) { + sb->feature_map + |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); + sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset + - rdev->data_offset)); + } } if (rdev->badblocks.count == 0) @@ -1871,6 +1901,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sector_t max_sectors; if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ + if (rdev->data_offset != rdev->new_data_offset) + return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; @@ -1898,6 +1930,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) rdev->sb_page); md_super_wait(rdev->mddev); return num_sectors; + +} + +static int +super_1_allow_new_offset(struct md_rdev *rdev, + unsigned long long new_offset) +{ + /* All necessary checks on new >= old have been done */ + struct bitmap *bitmap; + if (new_offset >= rdev->data_offset) + return 1; + + /* with 1.0 metadata, there is no metadata to tread on + * so we can always move back */ + if (rdev->mddev->minor_version == 0) + return 1; + + /* otherwise we must be sure not to step on + * any metadata, so stay: + * 36K beyond start of superblock + * beyond end of badblocks + * beyond write-intent bitmap + */ + if (rdev->sb_start + (32+4)*2 > new_offset) + return 0; + bitmap = rdev->mddev->bitmap; + if (bitmap && !rdev->mddev->bitmap_info.file && + rdev->sb_start + rdev->mddev->bitmap_info.offset + + bitmap->file_pages * (PAGE_SIZE>>9) > new_offset) + return 0; + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) + return 0; + + return 1; } static struct super_type super_types[] = { @@ -1908,6 +1974,7 @@ static struct super_type super_types[] = { .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, + .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", @@ -1916,6 +1983,7 @@ static struct super_type super_types[] = { .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, + .allow_new_offset = super_1_allow_new_offset, }, }; @@ -2823,9 +2891,8 @@ offset_show(struct md_rdev *rdev, char *page) static ssize_t offset_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long long offset = simple_strtoull(buf, &e, 10); - if (e==buf || (*e && 
*e != '\n')) + unsigned long long offset; + if (strict_strtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; @@ -2840,6 +2907,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); +static ssize_t new_offset_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)rdev->new_data_offset); +} + +static ssize_t new_offset_store(struct md_rdev *rdev, + const char *buf, size_t len) +{ + unsigned long long new_offset; + struct mddev *mddev = rdev->mddev; + + if (strict_strtoull(buf, 10, &new_offset) < 0) + return -EINVAL; + + if (mddev->sync_thread) + return -EBUSY; + if (new_offset == rdev->data_offset) + /* reset is always permitted */ + ; + else if (new_offset > rdev->data_offset) { + /* must not push array size beyond rdev_sectors */ + if (new_offset - rdev->data_offset + + mddev->dev_sectors > rdev->sectors) + return -E2BIG; + } + /* Metadata worries about other space details. */ + + /* decreasing the offset is inconsistent with a backwards + * reshape. + */ + if (new_offset < rdev->data_offset && + mddev->reshape_backwards) + return -EINVAL; + /* Increasing offset is inconsistent with forwards + * reshape. reshape_direction should be set to + * 'backwards' first. + */ + if (new_offset > rdev->data_offset && + !mddev->reshape_backwards) + return -EINVAL; + + if (mddev->pers && mddev->persistent && + !super_types[mddev->major_version] + .allow_new_offset(rdev, new_offset)) + return -E2BIG; + rdev->new_data_offset = new_offset; + if (new_offset > rdev->data_offset) + mddev->reshape_backwards = 1; + else if (new_offset < rdev->data_offset) + mddev->reshape_backwards = 0; + + return len; +} +static struct rdev_sysfs_entry rdev_new_offset = +__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); + static ssize_t rdev_size_show(struct md_rdev *rdev, char *page) { @@ -2884,6 +3008,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; + if (rdev->data_offset != rdev->new_data_offset) + return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { sectors = super_types[my_mddev->major_version]. 
@@ -3020,6 +3146,7 @@ static struct attribute *rdev_default_attrs[] = { &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, + &rdev_new_offset.attr, &rdev_size.attr, &rdev_recovery_start.attr, &rdev_bad_blocks.attr, @@ -3094,6 +3221,7 @@ int md_rdev_init(struct md_rdev *rdev) rdev->raid_disk = -1; rdev->flags = 0; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_events = 0; rdev->last_read_error.tv_sec = 0; rdev->last_read_error.tv_nsec = 0; @@ -3598,7 +3726,17 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) rv = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { + struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + + rdev_for_each(rdev, mddev) { + if (olddisks < n && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (olddisks > n && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); @@ -4445,6 +4583,7 @@ reshape_position_show(struct mddev *mddev, char *page) static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { + struct md_rdev *rdev; char *e; unsigned long long new = simple_strtoull(buf, &e, 10); if (mddev->pers) @@ -4457,6 +4596,8 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; return len; } @@ -6001,6 +6142,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) static int update_raid_disks(struct mddev *mddev, int raid_disks) { int rv; + struct md_rdev *rdev; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; @@ -6009,6 +6151,16 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return -EINVAL; if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; + + rdev_for_each(rdev, mddev) { + if (mddev->raid_disks < raid_disks && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (mddev->raid_disks > raid_disks && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } + mddev->delta_disks = raid_disks - mddev->raid_disks; if (mddev->delta_disks < 0) mddev->reshape_backwards = 1; @@ -7709,6 +7861,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) } EXPORT_SYMBOL(md_wait_for_blocked_rdev); +void md_finish_reshape(struct mddev *mddev) +{ + /* called be personality module when reshape completes. */ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->data_offset > rdev->new_data_offset) + rdev->sectors += rdev->data_offset - rdev->new_data_offset; + else + rdev->sectors -= rdev->new_data_offset - rdev->data_offset; + rdev->data_offset = rdev->new_data_offset; + } +} +EXPORT_SYMBOL(md_finish_reshape); /* Bad block management. 
* We can record which blocks on each device are 'bad' and so just @@ -7957,10 +8123,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, } int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int acknowledged) + int is_new) { - int rv = md_set_badblocks(&rdev->badblocks, - s + rdev->data_offset, sectors, acknowledged); + int rv; + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + rv = md_set_badblocks(&rdev->badblocks, + s, sectors, 0); if (rv) { /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -8066,11 +8237,15 @@ out: return rv; } -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) +int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; return md_clear_badblocks(&rdev->badblocks, - s + rdev->data_offset, - sectors); + s, sectors); } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); diff --git a/drivers/md/md.h b/drivers/md/md.h index d51c0ca37777..98913e8dac1a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -55,6 +55,7 @@ struct md_rdev { int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ + sector_t new_data_offset;/* only relevant while reshaping */ sector_t sb_start; /* offset of the super block (in 512byte sectors) */ int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ @@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, return 0; } extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int acknowledged); -extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); + int is_new); +extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); extern void md_ack_all_badblocks(struct badblocks *bb); struct mddev { @@ -592,6 +594,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); +extern void md_finish_reshape(struct mddev *mddev); extern int mddev_congested(struct mddev *mddev, int bits); extern void md_flush_request(struct mddev *mddev, struct bio *bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 15dd59b84e94..71a7dc038a82 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2024,7 +2024,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio continue; if (test_bit(BIO_UPTODATE, &bio->bi_flags) && test_bit(R1BIO_MadeGood, &r1_bio->state)) { - rdev_clear_badblocks(rdev, r1_bio->sector, s); + rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); } if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && test_bit(R1BIO_WriteError, &r1_bio->state)) { @@ -2044,7 +2044,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) struct md_rdev *rdev = conf->mirrors[m].rdev; rdev_clear_badblocks(rdev, r1_bio->sector, - r1_bio->sectors); + r1_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); } else if (r1_bio->bios[m] != NULL) { /* This drive got a write error. 
We need to diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3f91c2e1dfe7..832fb4d56657 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2480,7 +2480,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); } else { if (!rdev_set_badblocks( rdev, @@ -2496,7 +2496,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); } else { if (!rdev_set_badblocks( rdev, @@ -2515,7 +2515,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && !test_bit(BIO_UPTODATE, &bio->bi_flags)) { @@ -2532,7 +2532,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, - r10_bio->sectors); + r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0abbd3447cfb..3705585d7567 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3561,7 +3561,7 @@ finish: if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); + STRIPE_SECTORS, 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { @@ -3570,7 +3570,7 @@ finish: /* rdev have been moved down */ rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS); + STRIPE_SECTORS, 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -5505,10 +5505,14 @@ static int raid5_start_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { + /* Don't support changing data_offset yet */ + if (rdev->new_data_offset != rdev->data_offset) + return -EINVAL; if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; + } if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) /* Not enough devices even to make a degraded array diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 07e05f92d050..ee753536ab70 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -233,7 +233,10 @@ struct mdp_superblock_1 { __le32 delta_disks; /* change in number of raid_disks */ __le32 new_layout; /* new layout */ __le32 new_chunk; /* new chunk size (512byte sectors) */ - __u8 pad1[128-124]; /* set to 0 when written */ + __le32 new_offset; /* signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. + */ /* constant this-device information - 64 bytes */ __le64 data_offset; /* sector start of data, often 0 */ @@ -285,11 +288,14 @@ struct mdp_superblock_1 { * of devices, but is going * backwards anyway. 
*/ +#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ |MD_FEATURE_BAD_BLOCKS \ |MD_FEATURE_REPLACEMENT \ - |MD_FEATURE_RESHAPE_BACKWARDS) + |MD_FEATURE_RESHAPE_BACKWARDS \ + |MD_FEATURE_NEW_OFFSET \ + ) #endif -- cgit v1.2.3 From 048a8b8c89dc427dd7a58527c8923224b1e66d83 Mon Sep 17 00:00:00 2001 From: Jim Kukunas Date: Tue, 22 May 2012 13:54:18 +1000 Subject: lib/raid6: Add SSSE3 optimized recovery functions Add SSSE3 optimized recovery functions, as well as a system for selecting the most appropriate recovery functions to use. Originally-by: H. Peter Anvin Signed-off-by: Jim Kukunas Signed-off-by: NeilBrown --- include/linux/raid/pq.h | 18 ++- lib/raid6/Makefile | 2 +- lib/raid6/algos.c | 37 ++++++ lib/raid6/mktables.c | 25 ++++ lib/raid6/recov.c | 15 ++- lib/raid6/recov_ssse3.c | 335 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 425 insertions(+), 7 deletions(-) create mode 100644 lib/raid6/recov_ssse3.c (limited to 'include') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 53272e9860a7..640c69ceec96 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2; extern const struct raid6_calls raid6_altivec4; extern const struct raid6_calls raid6_altivec8; +struct raid6_recov_calls { + void (*data2)(int, size_t, int, int, void **); + void (*datap)(int, size_t, int, void **); + int (*valid)(void); + const char *name; + int priority; +}; + +extern const struct raid6_recov_calls raid6_recov_intx1; +extern const struct raid6_recov_calls raid6_recov_ssse3; + /* Algorithm list */ extern const struct raid6_calls * const raid6_algos[]; +extern const struct raid6_recov_calls *const raid6_recov_algos[]; int raid6_select_algo(void); /* Return values from chk_syndrome */ @@ -111,14 +123,16 @@ int raid6_select_algo(void); /* Galois field tables */ extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); +extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256))); extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); /* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, +extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); +extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila, + void **ptrs); void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 8a38102770f3..de06dfe165b8 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o -raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ +raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ altivec8.o mmx.o sse1.o sse2.o hostprogs-y += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index f6a0f7899163..5a7f8022be13 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -64,6 +64,20 @@ const struct raid6_calls * const raid6_algos[] = { NULL }; +void (*raid6_2data_recov)(int, size_t, int, int, void **); +EXPORT_SYMBOL_GPL(raid6_2data_recov); + +void 
(*raid6_datap_recov)(int, size_t, int, void **); +EXPORT_SYMBOL_GPL(raid6_datap_recov); + +const struct raid6_recov_calls *const raid6_recov_algos[] = { +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + &raid6_recov_ssse3, +#endif + &raid6_recov_intx1, + NULL +}; + #ifdef __KERNEL__ #define RAID6_TIME_JIFFIES_LG2 4 #else @@ -72,6 +86,26 @@ const struct raid6_calls * const raid6_algos[] = { #define time_before(x, y) ((x) < (y)) #endif +static inline void raid6_choose_recov(void) +{ + const struct raid6_recov_calls *const *algo; + const struct raid6_recov_calls *best; + + for (best = NULL, algo = raid6_recov_algos; *algo; algo++) + if (!best || (*algo)->priority > best->priority) + if (!(*algo)->valid || (*algo)->valid()) + best = *algo; + + if (best) { + raid6_2data_recov = best->data2; + raid6_datap_recov = best->datap; + + printk("raid6: using %s recovery algorithm\n", best->name); + } else + printk("raid6: Yikes! No recovery algorithm found!\n"); +} + + /* Try to pick the best algorithm */ /* This code uses the gfmul table as convenient data set to abuse */ @@ -141,6 +175,9 @@ int __init raid6_select_algo(void) free_pages((unsigned long)syndromes, 1); + /* select raid recover functions */ + raid6_choose_recov(); + return best ? 0 : -EINVAL; } diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c index 8a3780902cec..39787db588b0 100644 --- a/lib/raid6/mktables.c +++ b/lib/raid6/mktables.c @@ -81,6 +81,31 @@ int main(int argc, char *argv[]) printf("EXPORT_SYMBOL(raid6_gfmul);\n"); printf("#endif\n"); + /* Compute vector multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_vgfmul[256][32] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, (j + k) << 4), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); + printf("#endif\n"); + /* Compute power-of-2 table (exponent) */ v = 1; printf("\nconst u8 __attribute__((aligned(256)))\n" diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index fe275d7b6b36..1805a5cc5daa 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -22,7 +22,7 @@ #include /* Recover two failed data blocks. 
*/ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, +void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb, void **ptrs) { u8 *p, *q, *dp, *dq; @@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, p++; q++; } } -EXPORT_SYMBOL_GPL(raid6_2data_recov); /* Recover failure of one data block plus the P block */ -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) +void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs) { u8 *p, *q, *dq; const u8 *qmul; /* Q multiplier table */ @@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) q++; dq++; } } -EXPORT_SYMBOL_GPL(raid6_datap_recov); + + +const struct raid6_recov_calls raid6_recov_intx1 = { + .data2 = raid6_2data_recov_intx1, + .datap = raid6_datap_recov_intx1, + .valid = NULL, + .name = "intx1", + .priority = 0, +}; #ifndef __KERNEL__ /* Testing only */ diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c new file mode 100644 index 000000000000..37ae61930559 --- /dev/null +++ b/lib/raid6/recov_ssse3.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#include +#include "x86.h" + +static int raid6_has_ssse3(void) +{ + return boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2) && + boot_cpu_has(X86_FEATURE_SSSE3); +} + +void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb, + void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); + +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); +#endif + + /* Now do it... 
*/ + while (bytes) { +#ifdef CONFIG_X86_64 + /* xmm6, xmm14, xmm15 */ + + asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); + asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); + asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); + asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); + asm volatile("pxor %0,%%xmm1" : : "m" (dq[0])); + asm volatile("pxor %0,%%xmm9" : : "m" (dq[16])); + asm volatile("pxor %0,%%xmm0" : : "m" (dp[0])); + asm volatile("pxor %0,%%xmm8" : : "m" (dp[16])); + + /* xmm0/8 = px */ + + asm volatile("movdqa %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + asm volatile("movdqa %xmm6,%xmm12"); + asm volatile("movdqa %xmm5,%xmm13"); + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("movdqa %xmm9,%xmm11"); + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ + asm volatile("movdqa %xmm8,%xmm10"); + asm volatile("psraw $4,%xmm1"); + asm volatile("psraw $4,%xmm9"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pand %xmm7,%xmm9"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pshufb %xmm9,%xmm13"); + asm volatile("pxor %xmm4,%xmm5"); + asm volatile("pxor %xmm12,%xmm13"); + + /* xmm5/13 = qx */ + + asm volatile("movdqa %xmm14,%xmm4"); + asm volatile("movdqa %xmm15,%xmm1"); + asm volatile("movdqa %xmm14,%xmm12"); + asm volatile("movdqa %xmm15,%xmm9"); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("movdqa %xmm10,%xmm11"); + asm volatile("psraw $4,%xmm2"); + asm volatile("psraw $4,%xmm10"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pand %xmm7,%xmm10"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pshufb %xmm10,%xmm9"); + asm volatile("pxor %xmm4,%xmm1"); + asm volatile("pxor %xmm12,%xmm9"); + + /* xmm1/9 = pbmul[px] */ + asm volatile("pxor %xmm5,%xmm1"); + asm volatile("pxor %xmm13,%xmm9"); + /* xmm1/9 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("pxor %xmm9,%xmm8"); + asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); + asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#else + asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); + asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); + asm volatile("pxor %0,%%xmm1" : : "m" (*dq)); + asm volatile("pxor %0,%%xmm0" : : "m" (*dp)); + + /* 1 = dq ^ q + * 0 = dp ^ p + */ + asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("psraw $4,%xmm1"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pxor %xmm4,%xmm5"); + + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ + + /* xmm5 = qx */ + + asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("psraw $4,%xmm2"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pxor %xmm4,%xmm1"); + + /* xmm1 = pbmul[px] */ + asm 
volatile("pxor %xmm5,%xmm1"); + /* xmm1 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + + +void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + + /* xmm3 = q[0] ^ dq[0] */ + + asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm4 = q[16] ^ dq[16] */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %xmm4, %xmm8"); + + /* xmm4 = xmm8 = q[16] ^ dq[16] */ + + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); + asm volatile("pxor %xmm0, %xmm1"); + asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); + + /* xmm1 = qmul[q[0] ^ dq[0]] */ + + asm volatile("psraw $4, %xmm4"); + asm volatile("pand %xmm7, %xmm8"); + asm volatile("pand %xmm7, %xmm4"); + asm volatile("pshufb %xmm8, %xmm10"); + asm volatile("pshufb %xmm4, %xmm11"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("pxor %xmm10, %xmm11"); + asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); + + /* xmm11 = qmul[q[16] ^ dq[16]] */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ + + asm volatile("pxor %xmm11, %xmm12"); + + /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); + + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; + +#else + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm3 = *q ^ *dq */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("pxor %xmm0, %xmm1"); + + /* xmm1 = qmul[*q ^ *dq */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = *p ^ qmul[*q ^ *dq] 
*/ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_ssse3 = { + .data2 = raid6_2data_recov_ssse3, + .datap = raid6_datap_recov_ssse3, + .valid = raid6_has_ssse3, +#ifdef CONFIG_X86_64 + .name = "ssse3x2", +#else + .name = "ssse3x1", +#endif + .priority = 1, +}; + +#endif -- cgit v1.2.3
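
A note on the raid6_vgfmul table generated by mktables.c above: each 32-byte row holds the products of one constant with the sixteen low-nibble values followed by the sixteen high-nibble values, which is exactly what the paired pshufb lookups in recov_ssse3.c combine. A minimal scalar sketch of the same split-nibble multiply (the helper name is mine, not from the patch):

/* Scalar equivalent of the SSSE3 pshufb trick: GF(256) multiply of x by the
 * constant whose raid6_vgfmul row is passed in.  Because GF multiplication
 * distributes over XOR, c*x == c*(x & 0x0f) ^ c*(x & 0xf0), and each half is
 * a 16-entry table lookup. */
#include <linux/raid/pq.h>

static inline u8 raid6_vgfmul_byte(const u8 row[32], u8 x)
{
	return row[x & 0x0f] ^ row[16 + (x >> 4)];
}

/* e.g. raid6_vgfmul_byte(raid6_vgfmul[c], x) equals raid6_gfmul[c][x] */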
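
The raid6_recov_calls table also leaves room for further optimised implementations: raid6_choose_recov() simply picks the highest-priority entry whose valid() check passes. A sketch of what such an additional backend would have to supply (the AVX2 name, feature test and priority here are hypothetical and not part of this series):

/* Hypothetical extra backend, illustrating the plug-in interface only. */
#include <linux/raid/pq.h>
#include "x86.h"

static int raid6_has_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2);	/* assumed feature flag */
}

static void raid6_2data_recov_avx2(int disks, size_t bytes,
				   int faila, int failb, void **ptrs)
{
	/* recover two failed data blocks, along the lines of recov_ssse3.c */
}

static void raid6_datap_recov_avx2(int disks, size_t bytes,
				   int faila, void **ptrs)
{
	/* recover one failed data block plus the P block */
}

const struct raid6_recov_calls raid6_recov_avx2 = {
	.data2    = raid6_2data_recov_avx2,
	.datap    = raid6_datap_recov_avx2,
	.valid    = raid6_has_avx2,
	.name     = "avx2",
	.priority = 2,	/* outranks ssse3 (1) and intx1 (0) when usable */
};

/* It would also need an "&raid6_recov_avx2," entry ahead of raid6_recov_ssse3
 * in raid6_recov_algos[] in lib/raid6/algos.c, and its object file added to
 * the raid6_pq-y list in the Makefile. */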