diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-28 05:50:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-28 05:50:27 -0700 |
commit | 6140333d3656f62ac7e6a5af87e7fe92cfb8d655 (patch) | |
tree | d96f7ad2196b4383f5ca4396c956e24c82b2952c /drivers/md/md.h | |
parent | 6f56c218666b5c7eff354364357307d18c10058b (diff) | |
parent | 58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf (diff) |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (75 commits)
md/raid10: handle further errors during fix_read_error better.
md/raid10: Handle read errors during recovery better.
md/raid10: simplify read error handling during recovery.
md/raid10: record bad blocks due to write errors during resync/recovery.
md/raid10: attempt to fix read errors during resync/check
md/raid10: Handle write errors by updating badblock log.
md/raid10: clear bad-block record when write succeeds.
md/raid10: avoid writing to known bad blocks on known bad drives.
md/raid10 record bad blocks as needed during recovery.
md/raid10: avoid reading known bad blocks during resync/recovery.
md/raid10 - avoid reading from known bad blocks - part 3
md/raid10: avoid reading from known bad blocks - part 2
md/raid10: avoid reading from known bad blocks - part 1
md/raid10: Split handle_read_error out from raid10d.
md/raid10: simplify/reindent some loops.
md/raid5: Clear bad blocks on successful write.
md/raid5. Don't write to known bad block on doubtful devices.
md/raid5: write errors should be recorded as bad blocks if possible.
md/raid5: use bad-block log to improve handling of uncorrectable read errors.
md/raid5: avoid reading from known bad blocks.
...
Diffstat (limited to 'drivers/md/md.h')
-rw-r--r-- | drivers/md/md.h | 110 |
1 files changed, 98 insertions, 12 deletions
diff --git a/drivers/md/md.h b/drivers/md/md.h index 1c26c7a08ae6..1e586bb4452e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -29,6 +29,13 @@ typedef struct mddev_s mddev_t; typedef struct mdk_rdev_s mdk_rdev_t; +/* Bad block numbers are stored sorted in a single page. + * 64bits is used for each block or extent. + * 54 bits are sector number, 9 bits are extent size, + * 1 bit is an 'acknowledged' flag. + */ +#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) + /* * MD's 'extended' device */ @@ -48,7 +55,7 @@ struct mdk_rdev_s struct block_device *meta_bdev; struct block_device *bdev; /* block device handle */ - struct page *sb_page; + struct page *sb_page, *bb_page; int sb_loaded; __u64 sb_events; sector_t data_offset; /* start of data in array */ @@ -74,9 +81,29 @@ struct mdk_rdev_s #define In_sync 2 /* device is in_sync with rest of array */ #define WriteMostly 4 /* Avoid reading if at all possible */ #define AutoDetected 7 /* added by auto-detect */ -#define Blocked 8 /* An error occurred on an externally - * managed array, don't allow writes +#define Blocked 8 /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes * until it is cleared */ +#define WriteErrorSeen 9 /* A write error has been seen on this + * device + */ +#define FaultRecorded 10 /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ +#define BlockedBadBlocks 11 /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ wait_queue_head_t blocked_wait; int desc_nr; /* descriptor index in the superblock */ @@ -111,8 +138,54 @@ struct mdk_rdev_s struct sysfs_dirent *sysfs_state; /* handle for 'state' * sysfs entry */ + + struct badblocks { + int count; /* count of bad blocks */ + int unacked_exist; /* there probably are unacknowledged + * bad blocks. This is only cleared + * when a read discovers none + */ + int shift; /* shift from sectors to block size + * a -ve shift means badblocks are + * disabled.*/ + u64 *page; /* badblock list */ + int changed; + seqlock_t lock; + + sector_t sector; + sector_t size; /* in sectors */ + } badblocks; }; +#define BB_LEN_MASK (0x00000000000001FFULL) +#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) +#define BB_ACK_MASK (0x8000000000000000ULL) +#define BB_MAX_LEN 512 +#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) +#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) +#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) +#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + +extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + if (unlikely(rdev->badblocks.count)) { + int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + sectors, + first_bad, bad_sectors); + if (rv) + *first_bad -= rdev->data_offset; + return rv; + } + return 0; +} +extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, + int acknowledged); +extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors); +extern void md_ack_all_badblocks(struct badblocks *bb); + struct mddev_s { void *private; @@ -239,9 +312,12 @@ struct mddev_s #define MD_RECOVERY_FROZEN 9 unsigned long recovery; - int recovery_disabled; /* if we detect that recovery - * will always fail, set this - * so we don't loop trying */ + /* If a RAID personality determines that recovery (of a particular + * device) will fail due to a read error on the source device, it + * takes a copy of this number and does not attempt recovery again + * until this number changes. + */ + int recovery_disabled; int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so @@ -304,11 +380,6 @@ struct mddev_s * hot-adding a bitmap. It should * eventually be settable by sysfs. */ - /* When md is serving under dm, it might use a - * dirty_log to store the bits. - */ - struct dm_dirty_log *log; - struct mutex mutex; unsigned long chunksize; unsigned long daemon_sleep; /* how many jiffies between updates? */ @@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev) return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; } +static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev) +{ + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); +} + +static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev) +{ + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); +} + /* * iterates through some rdev ringlist. It's safe to remove the * current 'rdev'. Dont touch 'tmp' though. @@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev); extern int md_run(mddev_t *mddev); extern void md_stop(mddev_t *mddev); extern void md_stop_writes(mddev_t *mddev); -extern void md_rdev_init(mdk_rdev_t *rdev); +extern int md_rdev_init(mdk_rdev_t *rdev); extern void mddev_suspend(mddev_t *mddev); extern void mddev_resume(mddev_t *mddev); @@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, mddev_t *mddev); extern int mddev_check_plugged(mddev_t *mddev); +extern void md_trim_bio(struct bio *bio, int offset, int size); #endif /* _MD_MD_H */ |